Skip to content

Commit c3f0797

Browse files
authored
Implement iteration cache, item cache and length cache for node list iteration (#11330)
* Implement iteration cache, item cache and length cache for node list iteration The current implementation follows the spec requirement that the list must be "live". This means that changes in the document must be reflected in the existing node lists without requiring the user to refetch the node list. The consequence is that getting any item, or the length of the list, always starts searching from the root element of the node list. This results in O(n) time to get any item or the length. If there's a for loop over the node list, this means the iterations will take O(n²) time in total. This causes real-world performance issues with potential for downtime (see GH-11308 and its references for details). We fix this by introducing a caching strategy. We cache the last iterated object in the iterator, the last requested item in the node list, and the last length computation. To invalidate the cache, we simply count the number of modifications made to the containing document. If the modification number does not match what the number was during caching, we know the document has been modified and the cache is invalid. If this ever overflows, we saturate the modification number and don't do any caching anymore. Note that we don't check for overflow on 64-bit systems because it would take hundreds of years to overflow. Fixes GH-11308.
1 parent ce724d1 commit c3f0797

24 files changed

+762
-64
lines changed

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ PHP NEWS
4444
- Date:
4545
. Implement More Appropriate Date/Time Exceptions RFC. (Derick)
4646

47+
- DOM:
48+
. Fix bug GH-11308 (getElementsByTagName() is O(N^2)). (nielsdos)
49+
4750
- Exif:
4851
. Removed unneeded codepaths in exif_process_TIFF_in_JPEG(). (nielsdos)
4952

UPGRADING.INTERNALS

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,24 @@ PHP 8.3 INTERNALS UPGRADE NOTES
120120
- A new function dom_get_doc_props_read_only() is added to gather the document
121121
properties in a read-only way. This function avoids allocation when there are
122122
no document properties changed yet.
123+
- The node list returned by DOMNode::getElementsByTagName() and
124+
DOMNode::getElementsByTagNameNS() now caches the length and the last requested item.
125+
This means that the length and the last requested item are not recalculated
126+
when the node list is iterated over multiple times.
127+
If you do not use the internal PHP dom APIs to modify the document, you need to
128+
manually invalidate the cache using php_libxml_invalidate_node_list_cache_from_doc().
129+
Furthermore, the following internal APIs were added to handle the cache:
130+
. php_dom_is_cache_tag_stale_from_doc_ptr()
131+
. php_dom_is_cache_tag_stale_from_node()
132+
. php_dom_mark_cache_tag_up_to_date_from_node()
133+
- The function dom_get_elements_by_tag_name_ns_raw() has an additional parameter to indicate
134+
the base node of the node list. This function also no longer accepts -1 as the index argument.
135+
- The function dom_namednode_iter() has additional arguments to avoid recomputing the length of
136+
the strings.
137+
138+
g. ext/libxml
139+
- Two new functions: php_libxml_invalidate_node_list_cache_from_doc() and
140+
php_libxml_invalidate_node_list_cache() were added to invalidate the cache of a node list.
123141

124142
========================
125143
4. OpCode changes

ext/dom/document.c

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,6 @@ PHP_METHOD(DOMDocument, getElementsByTagName)
777777
size_t name_len;
778778
dom_object *intern, *namednode;
779779
char *name;
780-
xmlChar *local;
781780

782781
id = ZEND_THIS;
783782
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &name, &name_len) == FAILURE) {
@@ -788,8 +787,7 @@ PHP_METHOD(DOMDocument, getElementsByTagName)
788787

789788
php_dom_create_iterator(return_value, DOM_NODELIST);
790789
namednode = Z_DOMOBJ_P(return_value);
791-
local = xmlCharStrndup(name, name_len);
792-
dom_namednode_iter(intern, 0, namednode, NULL, local, NULL);
790+
dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, NULL, 0);
793791
}
794792
/* }}} end dom_document_get_elements_by_tag_name */
795793

@@ -847,6 +845,8 @@ PHP_METHOD(DOMDocument, importNode)
847845
}
848846
}
849847

848+
php_libxml_invalidate_node_list_cache_from_doc(docp);
849+
850850
DOM_RET_OBJ((xmlNodePtr) retnodep, &ret, intern);
851851
}
852852
/* }}} end dom_document_import_node */
@@ -991,7 +991,6 @@ PHP_METHOD(DOMDocument, getElementsByTagNameNS)
991991
size_t uri_len, name_len;
992992
dom_object *intern, *namednode;
993993
char *uri, *name;
994-
xmlChar *local, *nsuri;
995994

996995
id = ZEND_THIS;
997996
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s!s", &uri, &uri_len, &name, &name_len) == FAILURE) {
@@ -1002,9 +1001,7 @@ PHP_METHOD(DOMDocument, getElementsByTagNameNS)
10021001

10031002
php_dom_create_iterator(return_value, DOM_NODELIST);
10041003
namednode = Z_DOMOBJ_P(return_value);
1005-
local = xmlCharStrndup(name, name_len);
1006-
nsuri = xmlCharStrndup(uri ? uri : "", uri_len);
1007-
dom_namednode_iter(intern, 0, namednode, NULL, local, nsuri);
1004+
dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, uri ? uri : "", uri_len);
10081005
}
10091006
/* }}} end dom_document_get_elements_by_tag_name_ns */
10101007

@@ -1070,6 +1067,8 @@ PHP_METHOD(DOMDocument, normalizeDocument)
10701067

10711068
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
10721069

1070+
php_libxml_invalidate_node_list_cache_from_doc(docp);
1071+
10731072
dom_normalize((xmlNodePtr) docp);
10741073
}
10751074
/* }}} end dom_document_normalize_document */
@@ -1328,10 +1327,14 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13281327

13291328
if (id != NULL) {
13301329
intern = Z_DOMOBJ_P(id);
1330+
size_t old_modification_nr = 0;
13311331
if (intern != NULL) {
13321332
docp = (xmlDocPtr) dom_object_get_node(intern);
13331333
doc_prop = NULL;
13341334
if (docp != NULL) {
1335+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1336+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1337+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
13351338
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
13361339
doc_prop = intern->document->doc_props;
13371340
intern->document->doc_props = NULL;
@@ -1348,6 +1351,12 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13481351
}
13491352

13501353
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1354+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1355+
if (old_modification_nr != 0) {
1356+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1357+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1358+
php_libxml_invalidate_node_list_cache(doc_ptr);
1359+
}
13511360

13521361
RETURN_TRUE;
13531362
} else {
@@ -1563,6 +1572,8 @@ PHP_METHOD(DOMDocument, xinclude)
15631572
php_dom_remove_xinclude_nodes(root);
15641573
}
15651574

1575+
php_libxml_invalidate_node_list_cache_from_doc(docp);
1576+
15661577
if (err) {
15671578
RETVAL_LONG(err);
15681579
} else {
@@ -1871,10 +1882,14 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18711882

18721883
if (id != NULL && instanceof_function(Z_OBJCE_P(id), dom_document_class_entry)) {
18731884
intern = Z_DOMOBJ_P(id);
1885+
size_t old_modification_nr = 0;
18741886
if (intern != NULL) {
18751887
docp = (xmlDocPtr) dom_object_get_node(intern);
18761888
doc_prop = NULL;
18771889
if (docp != NULL) {
1890+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1891+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1892+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
18781893
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
18791894
doc_prop = intern->document->doc_props;
18801895
intern->document->doc_props = NULL;
@@ -1891,6 +1906,12 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18911906
}
18921907

18931908
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1909+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1910+
if (old_modification_nr != 0) {
1911+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1912+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1913+
php_libxml_invalidate_node_list_cache(doc_ptr);
1914+
}
18941915

18951916
RETURN_TRUE;
18961917
} else {

ext/dom/documenttype.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ int dom_documenttype_entities_read(dom_object *obj, zval *retval)
6565
entityht = (xmlHashTable *) doctypep->entities;
6666

6767
intern = Z_DOMOBJ_P(retval);
68-
dom_namednode_iter(obj, XML_ENTITY_NODE, intern, entityht, NULL, NULL);
68+
dom_namednode_iter(obj, XML_ENTITY_NODE, intern, entityht, NULL, 0, NULL, 0);
6969

7070
return SUCCESS;
7171
}
@@ -93,7 +93,7 @@ int dom_documenttype_notations_read(dom_object *obj, zval *retval)
9393
notationht = (xmlHashTable *) doctypep->notations;
9494

9595
intern = Z_DOMOBJ_P(retval);
96-
dom_namednode_iter(obj, XML_NOTATION_NODE, intern, notationht, NULL, NULL);
96+
dom_namednode_iter(obj, XML_NOTATION_NODE, intern, notationht, NULL, 0, NULL, 0);
9797

9898
return SUCCESS;
9999
}

ext/dom/dom_iterators.c

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
179179
dom_object *intern;
180180
dom_object *nnmap;
181181
dom_nnodemap_object *objmap;
182-
int previndex=0;
182+
int previndex;
183183
HashTable *nodeht;
184184
zval *entry;
185185
bool do_curobj_undef = 1;
@@ -205,23 +205,32 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
205205
do_curobj_undef = 0;
206206
}
207207
} else {
208-
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
209208
if (objmap->nodetype == XML_ATTRIBUTE_NODE ||
210209
objmap->nodetype == XML_ELEMENT_NODE) {
210+
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
211211
curnode = curnode->next;
212212
} else {
213-
/* Nav the tree evey time as this is LIVE */
213+
/* The collection is live, we nav the tree from the base object if we cannot
214+
* use the cache to restart from the last point. */
214215
basenode = dom_object_get_node(objmap->baseobj);
215-
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
216-
basenode->type == XML_HTML_DOCUMENT_NODE)) {
217-
basenode = xmlDocGetRootElement((xmlDoc *) basenode);
218-
} else if (basenode) {
219-
basenode = basenode->children;
220-
} else {
216+
if (UNEXPECTED(!basenode)) {
221217
goto err;
222218
}
219+
if (php_dom_is_cache_tag_stale_from_node(&iterator->cache_tag, basenode)) {
220+
php_dom_mark_cache_tag_up_to_date_from_node(&iterator->cache_tag, basenode);
221+
previndex = 0;
222+
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
223+
basenode->type == XML_HTML_DOCUMENT_NODE)) {
224+
curnode = xmlDocGetRootElement((xmlDoc *) basenode);
225+
} else {
226+
curnode = basenode->children;
227+
}
228+
} else {
229+
previndex = iter->index - 1;
230+
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
231+
}
223232
curnode = dom_get_elements_by_tag_name_ns_raw(
224-
basenode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
233+
basenode, curnode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
225234
}
226235
}
227236
} else {
@@ -258,7 +267,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
258267
{
259268
dom_object *intern;
260269
dom_nnodemap_object *objmap;
261-
xmlNodePtr nodep, curnode=NULL;
270+
xmlNodePtr curnode=NULL;
262271
int curindex = 0;
263272
HashTable *nodeht;
264273
zval *entry;
@@ -270,6 +279,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
270279
}
271280
iterator = emalloc(sizeof(php_dom_iterator));
272281
zend_iterator_init(&iterator->intern);
282+
iterator->cache_tag.modification_nr = 0;
273283

274284
ZVAL_OBJ_COPY(&iterator->intern.data, Z_OBJ_P(object));
275285
iterator->intern.funcs = &php_dom_iterator_funcs;
@@ -288,24 +298,25 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
288298
ZVAL_COPY(&iterator->curobj, entry);
289299
}
290300
} else {
291-
nodep = (xmlNode *)dom_object_get_node(objmap->baseobj);
292-
if (!nodep) {
301+
xmlNodePtr basep = (xmlNode *)dom_object_get_node(objmap->baseobj);
302+
if (!basep) {
293303
goto err;
294304
}
295305
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
296306
if (objmap->nodetype == XML_ATTRIBUTE_NODE) {
297-
curnode = (xmlNodePtr) nodep->properties;
307+
curnode = (xmlNodePtr) basep->properties;
298308
} else {
299-
curnode = (xmlNodePtr) nodep->children;
309+
curnode = (xmlNodePtr) basep->children;
300310
}
301311
} else {
312+
xmlNodePtr nodep = basep;
302313
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
303314
nodep = xmlDocGetRootElement((xmlDoc *) nodep);
304315
} else {
305316
nodep = nodep->children;
306317
}
307318
curnode = dom_get_elements_by_tag_name_ns_raw(
308-
nodep, (char *) objmap->ns, (char *) objmap->local, &curindex, 0);
319+
basep, nodep, (char *) objmap->ns, (char *) objmap->local, &curindex, 0);
309320
}
310321
}
311322
} else {

ext/dom/element.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,6 @@ PHP_METHOD(DOMElement, getElementsByTagName)
511511
size_t name_len;
512512
dom_object *intern, *namednode;
513513
char *name;
514-
xmlChar *local;
515514

516515
id = ZEND_THIS;
517516
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &name, &name_len) == FAILURE) {
@@ -522,8 +521,7 @@ PHP_METHOD(DOMElement, getElementsByTagName)
522521

523522
php_dom_create_iterator(return_value, DOM_NODELIST);
524523
namednode = Z_DOMOBJ_P(return_value);
525-
local = xmlCharStrndup(name, name_len);
526-
dom_namednode_iter(intern, 0, namednode, NULL, local, NULL);
524+
dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, NULL, 0);
527525
}
528526
/* }}} end dom_element_get_elements_by_tag_name */
529527

@@ -930,7 +928,6 @@ PHP_METHOD(DOMElement, getElementsByTagNameNS)
930928
size_t uri_len, name_len;
931929
dom_object *intern, *namednode;
932930
char *uri, *name;
933-
xmlChar *local, *nsuri;
934931

935932
id = ZEND_THIS;
936933
if (zend_parse_parameters(ZEND_NUM_ARGS(), "s!s", &uri, &uri_len, &name, &name_len) == FAILURE) {
@@ -941,9 +938,7 @@ PHP_METHOD(DOMElement, getElementsByTagNameNS)
941938

942939
php_dom_create_iterator(return_value, DOM_NODELIST);
943940
namednode = Z_DOMOBJ_P(return_value);
944-
local = xmlCharStrndup(name, name_len);
945-
nsuri = xmlCharStrndup(uri ? uri : "", uri_len);
946-
dom_namednode_iter(intern, 0, namednode, NULL, local, nsuri);
941+
dom_namednode_iter(intern, 0, namednode, NULL, name, name_len, uri ? uri : "", uri_len);
947942

948943
}
949944
/* }}} end dom_element_get_elements_by_tag_name_ns */

ext/dom/node.c

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,8 @@ int dom_node_node_value_write(dom_object *obj, zval *newval)
195195
break;
196196
}
197197

198+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
199+
198200
zend_string_release_ex(str, 0);
199201
return SUCCESS;
200202
}
@@ -274,7 +276,7 @@ int dom_node_child_nodes_read(dom_object *obj, zval *retval)
274276

275277
php_dom_create_iterator(retval, DOM_NODELIST);
276278
intern = Z_DOMOBJ_P(retval);
277-
dom_namednode_iter(obj, XML_ELEMENT_NODE, intern, NULL, NULL, NULL);
279+
dom_namednode_iter(obj, XML_ELEMENT_NODE, intern, NULL, NULL, 0, NULL, 0);
278280

279281
return SUCCESS;
280282
}
@@ -482,7 +484,7 @@ int dom_node_attributes_read(dom_object *obj, zval *retval)
482484
if (nodep->type == XML_ELEMENT_NODE) {
483485
php_dom_create_iterator(retval, DOM_NAMEDNODEMAP);
484486
intern = Z_DOMOBJ_P(retval);
485-
dom_namednode_iter(obj, XML_ATTRIBUTE_NODE, intern, NULL, NULL, NULL);
487+
dom_namednode_iter(obj, XML_ATTRIBUTE_NODE, intern, NULL, NULL, 0, NULL, 0);
486488
} else {
487489
ZVAL_NULL(retval);
488490
}
@@ -769,6 +771,8 @@ int dom_node_text_content_write(dom_object *obj, zval *newval)
769771
return FAILURE;
770772
}
771773

774+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
775+
772776
const xmlChar *xmlChars = (const xmlChar *) ZSTR_VAL(str);
773777
int type = nodep->type;
774778

@@ -897,6 +901,8 @@ PHP_METHOD(DOMNode, insertBefore)
897901
php_libxml_increment_doc_ref((php_libxml_node_object *)childobj, NULL);
898902
}
899903

904+
php_libxml_invalidate_node_list_cache_from_doc(parentp->doc);
905+
900906
if (ref != NULL) {
901907
DOM_GET_OBJ(refp, ref, xmlNodePtr, refpobj);
902908
if (refp->parent != parentp) {
@@ -1086,6 +1092,7 @@ PHP_METHOD(DOMNode, replaceChild)
10861092
nodep->doc->intSubset = (xmlDtd *) newchild;
10871093
}
10881094
}
1095+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
10891096
DOM_RET_OBJ(oldchild, &ret, intern);
10901097
}
10911098
/* }}} end dom_node_replace_child */
@@ -1127,6 +1134,7 @@ PHP_METHOD(DOMNode, removeChild)
11271134
}
11281135

11291136
xmlUnlinkNode(child);
1137+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
11301138
DOM_RET_OBJ(child, &ret, intern);
11311139
}
11321140
/* }}} end dom_node_remove_child */
@@ -1230,6 +1238,8 @@ PHP_METHOD(DOMNode, appendChild)
12301238

12311239
dom_reconcile_ns(nodep->doc, new_child);
12321240

1241+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
1242+
12331243
DOM_RET_OBJ(new_child, &ret, intern);
12341244
}
12351245
/* }}} end dom_node_append_child */
@@ -1339,6 +1349,8 @@ PHP_METHOD(DOMNode, normalize)
13391349

13401350
DOM_GET_OBJ(nodep, id, xmlNodePtr, intern);
13411351

1352+
php_libxml_invalidate_node_list_cache_from_doc(nodep->doc);
1353+
13421354
dom_normalize(nodep);
13431355

13441356
}
@@ -1571,6 +1583,8 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
15711583
RETURN_THROWS();
15721584
}
15731585

1586+
php_libxml_invalidate_node_list_cache_from_doc(docp);
1587+
15741588
if (xpath_array == NULL) {
15751589
if (nodep->type != XML_DOCUMENT_NODE) {
15761590
ctxp = xmlXPathNewContext(docp);

0 commit comments

Comments
 (0)