Skip to content

Commit c4f8ff6

Browse files
committed
Implement iteration cache, item cache and length cache for node list iteration
The current implementation follows the spec requirement that the list must be "live". This means that changes in the document must be reflected in the existing node lists without requiring the user to refetch the node list. The consequence is that getting any item, or the length of the list, always starts searching from the root element of the node list. This results in O(n) time to get any item or the length. If there's a for loop over the node list, this means the iterations will take O(n²) time in total. This causes real-world performance issues with potential for downtime (see GH-11308 and its references for details). We fix this by introducing a caching strategy. We cache the last iterated object in the iterator, the last requested item in the node list, and the last length computation. To invalidate the cache, we simply count the number of modifications made to the containing document. If the modification number does not match what the number was during caching, we know the document has been modified and the cache is invalid. If this ever overflows, we saturate the modification number and don't do any caching anymore. Note that we don't check for overflow on 64-bit systems because it would take hundreds of years to overflow. Fixes GH-11308.
1 parent fbe6696 commit c4f8ff6

15 files changed

+398
-22
lines changed

UPGRADING.INTERNALS

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,14 @@ PHP 8.3 INTERNALS UPGRADE NOTES
116116
- The PHPAPI spl_iterator_apply() function now returns zend_result instead of int.
117117
There are no functional changes.
118118

119+
f. ext/dom
120+
- The node list returned by DOMNode::getElementsByTagName() and
121+
DOMNode::getElementsByTagNameNS() now caches the length and the last requested item.
122+
This means that the length and the last requested item are not recalculated
123+
when the node list is iterated over multiple times.
124+
If you do not use the internal PHP dom APIs to modify the document, you need to
125+
manually invalidate the cache using php_dom_invalidate_node_list_cache().
126+
119127
========================
120128
4. OpCode changes
121129
========================

ext/dom/document.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,8 @@ PHP_METHOD(DOMDocument, importNode)
847847
}
848848
}
849849

850+
php_dom_invalidate_node_list_cache(docp);
851+
850852
DOM_RET_OBJ((xmlNodePtr) retnodep, &ret, intern);
851853
}
852854
/* }}} end dom_document_import_node */
@@ -1070,6 +1072,8 @@ PHP_METHOD(DOMDocument, normalizeDocument)
10701072

10711073
DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
10721074

1075+
php_dom_invalidate_node_list_cache(docp);
1076+
10731077
dom_normalize((xmlNodePtr) docp);
10741078
}
10751079
/* }}} end dom_document_normalize_document */
@@ -1333,10 +1337,14 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13331337

13341338
if (id != NULL) {
13351339
intern = Z_DOMOBJ_P(id);
1340+
size_t old_modification_nr = 0;
13361341
if (intern != NULL) {
13371342
docp = (xmlDocPtr) dom_object_get_node(intern);
13381343
doc_prop = NULL;
13391344
if (docp != NULL) {
1345+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1346+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1347+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
13401348
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
13411349
doc_prop = intern->document->doc_props;
13421350
intern->document->doc_props = NULL;
@@ -1353,6 +1361,12 @@ static void dom_parse_document(INTERNAL_FUNCTION_PARAMETERS, int mode) {
13531361
}
13541362

13551363
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1364+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1365+
if (old_modification_nr != 0) {
1366+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1367+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1368+
php_libxml_invalidate_node_list_cache(doc_ptr);
1369+
}
13561370

13571371
RETURN_TRUE;
13581372
} else {
@@ -1570,6 +1584,8 @@ PHP_METHOD(DOMDocument, xinclude)
15701584
php_dom_remove_xinclude_nodes(root);
15711585
}
15721586

1587+
php_dom_invalidate_node_list_cache(docp);
1588+
15731589
if (err) {
15741590
RETVAL_LONG(err);
15751591
} else {
@@ -1878,10 +1894,14 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18781894

18791895
if (id != NULL && instanceof_function(Z_OBJCE_P(id), dom_document_class_entry)) {
18801896
intern = Z_DOMOBJ_P(id);
1897+
size_t old_modification_nr = 0;
18811898
if (intern != NULL) {
18821899
docp = (xmlDocPtr) dom_object_get_node(intern);
18831900
doc_prop = NULL;
18841901
if (docp != NULL) {
1902+
const php_libxml_doc_ptr *doc_ptr = docp->_private;
1903+
ZEND_ASSERT(doc_ptr != NULL); /* Must exist, we have a document */
1904+
old_modification_nr = doc_ptr->cache_tag.modification_nr;
18851905
php_libxml_decrement_node_ptr((php_libxml_node_object *) intern);
18861906
doc_prop = intern->document->doc_props;
18871907
intern->document->doc_props = NULL;
@@ -1898,6 +1918,12 @@ static void dom_load_html(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{ */
18981918
}
18991919

19001920
php_libxml_increment_node_ptr((php_libxml_node_object *)intern, (xmlNodePtr)newdoc, (void *)intern);
1921+
/* Since iterators should invalidate, we need to start the modification number from the old counter */
1922+
if (old_modification_nr != 0) {
1923+
php_libxml_doc_ptr* doc_ptr = (php_libxml_doc_ptr*) ((php_libxml_node_object*) intern)->node; /* downcast */
1924+
doc_ptr->cache_tag.modification_nr = old_modification_nr;
1925+
php_libxml_invalidate_node_list_cache(doc_ptr);
1926+
}
19011927

19021928
RETURN_TRUE;
19031929
} else {

ext/dom/dom_iterators.c

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
179179
dom_object *intern;
180180
dom_object *nnmap;
181181
dom_nnodemap_object *objmap;
182-
int previndex=0;
182+
int previndex;
183183
HashTable *nodeht;
184184
zval *entry;
185185
bool do_curobj_undef = 1;
@@ -205,20 +205,27 @@ static void php_dom_iterator_move_forward(zend_object_iterator *iter) /* {{{ */
205205
do_curobj_undef = 0;
206206
}
207207
} else {
208-
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
209208
if (objmap->nodetype == XML_ATTRIBUTE_NODE ||
210209
objmap->nodetype == XML_ELEMENT_NODE) {
210+
curnode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
211211
curnode = curnode->next;
212212
} else {
213213
/* Nav the tree evey time as this is LIVE */
214214
basenode = dom_object_get_node(objmap->baseobj);
215-
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
216-
basenode->type == XML_HTML_DOCUMENT_NODE)) {
217-
basenode = xmlDocGetRootElement((xmlDoc *) basenode);
218-
} else if (basenode) {
219-
basenode = basenode->children;
215+
if (php_dom_is_cache_tag_stale_from_node(&iterator->cache_tag, basenode)) {
216+
if (basenode && (basenode->type == XML_DOCUMENT_NODE ||
217+
basenode->type == XML_HTML_DOCUMENT_NODE)) {
218+
basenode = xmlDocGetRootElement((xmlDoc *) basenode);
219+
} else if (basenode) {
220+
basenode = basenode->children;
221+
} else {
222+
goto err;
223+
}
224+
php_dom_mark_cache_tag_up_to_date_from_node(&iterator->cache_tag, basenode);
225+
previndex = 0;
220226
} else {
221-
goto err;
227+
previndex = iter->index - 1;
228+
basenode = (xmlNodePtr)((php_libxml_node_ptr *)intern->ptr)->node;
222229
}
223230
curnode = dom_get_elements_by_tag_name_ns_raw(
224231
basenode, (char *) objmap->ns, (char *) objmap->local, &previndex, iter->index);
@@ -270,6 +277,7 @@ zend_object_iterator *php_dom_get_iterator(zend_class_entry *ce, zval *object, i
270277
}
271278
iterator = emalloc(sizeof(php_dom_iterator));
272279
zend_iterator_init(&iterator->intern);
280+
iterator->cache_tag.modification_nr = 0;
273281

274282
ZVAL_OBJ_COPY(&iterator->intern.data, Z_OBJ_P(object));
275283
iterator->intern.funcs = &php_dom_iterator_funcs;

ext/dom/node.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,8 @@ int dom_node_text_content_write(dom_object *obj, zval *newval)
769769
return FAILURE;
770770
}
771771

772+
php_dom_invalidate_node_list_cache(nodep->doc);
773+
772774
if (nodep->type == XML_ELEMENT_NODE || nodep->type == XML_ATTRIBUTE_NODE) {
773775
if (nodep->children) {
774776
node_list_unlink(nodep->children);
@@ -886,6 +888,8 @@ PHP_METHOD(DOMNode, insertBefore)
886888
php_libxml_increment_doc_ref((php_libxml_node_object *)childobj, NULL);
887889
}
888890

891+
php_dom_invalidate_node_list_cache(parentp->doc);
892+
889893
if (ref != NULL) {
890894
DOM_GET_OBJ(refp, ref, xmlNodePtr, refpobj);
891895
if (refp->parent != parentp) {
@@ -1075,6 +1079,7 @@ PHP_METHOD(DOMNode, replaceChild)
10751079
nodep->doc->intSubset = (xmlDtd *) newchild;
10761080
}
10771081
}
1082+
php_dom_invalidate_node_list_cache(nodep->doc);
10781083
DOM_RET_OBJ(oldchild, &ret, intern);
10791084
}
10801085
/* }}} end dom_node_replace_child */
@@ -1116,6 +1121,7 @@ PHP_METHOD(DOMNode, removeChild)
11161121
}
11171122

11181123
xmlUnlinkNode(child);
1124+
php_dom_invalidate_node_list_cache(nodep->doc);
11191125
DOM_RET_OBJ(child, &ret, intern);
11201126
}
11211127
/* }}} end dom_node_remove_child */
@@ -1219,6 +1225,8 @@ PHP_METHOD(DOMNode, appendChild)
12191225

12201226
dom_reconcile_ns(nodep->doc, new_child);
12211227

1228+
php_dom_invalidate_node_list_cache(nodep->doc);
1229+
12221230
DOM_RET_OBJ(new_child, &ret, intern);
12231231
}
12241232
/* }}} end dom_node_append_child */
@@ -1328,6 +1336,8 @@ PHP_METHOD(DOMNode, normalize)
13281336

13291337
DOM_GET_OBJ(nodep, id, xmlNodePtr, intern);
13301338

1339+
php_dom_invalidate_node_list_cache(nodep->doc);
1340+
13311341
dom_normalize(nodep);
13321342

13331343
}
@@ -1560,6 +1570,8 @@ static void dom_canonicalization(INTERNAL_FUNCTION_PARAMETERS, int mode) /* {{{
15601570
RETURN_THROWS();
15611571
}
15621572

1573+
php_dom_invalidate_node_list_cache(docp);
1574+
15631575
if (xpath_array == NULL) {
15641576
if (nodep->type != XML_DOCUMENT_NODE) {
15651577
ctxp = xmlXPathNewContext(docp);

ext/dom/nodelist.c

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,23 @@
3131
* Since:
3232
*/
3333

34+
static zend_always_inline void objmap_cache_release_cached_obj(dom_nnodemap_object *objmap)
35+
{
36+
if (objmap->cached_obj) {
37+
if (GC_DELREF(&objmap->cached_obj->std) == 0) {
38+
zend_objects_store_del(&objmap->cached_obj->std);
39+
}
40+
objmap->cached_obj = NULL;
41+
objmap->cached_obj_index = 0;
42+
}
43+
}
44+
45+
static zend_always_inline void reset_objmap_cache(dom_nnodemap_object *objmap)
46+
{
47+
objmap_cache_release_cached_obj(objmap);
48+
objmap->cached_length = -1;
49+
}
50+
3451
static int get_nodelist_length(dom_object *obj)
3552
{
3653
dom_nnodemap_object *objmap = (dom_nnodemap_object *) obj->ptr;
@@ -52,6 +69,17 @@ static int get_nodelist_length(dom_object *obj)
5269
return 0;
5370
}
5471

72+
if (!php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
73+
if (objmap->cached_length >= 0) {
74+
return objmap->cached_length;
75+
}
76+
/* Only the length is out-of-date, the cache tag is still valid.
77+
* Therefore, only overwrite the length and keep the currently cached object. */
78+
} else {
79+
php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, nodep);
80+
reset_objmap_cache(objmap);
81+
}
82+
5583
int count = 0;
5684
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
5785
xmlNodePtr curnode = nodep->children;
@@ -72,6 +100,8 @@ static int get_nodelist_length(dom_object *obj)
72100
nodep, (char *) objmap->ns, (char *) objmap->local, &count, -1);
73101
}
74102

103+
objmap->cached_length = count;
104+
75105
return count;
76106
}
77107

@@ -113,11 +143,12 @@ PHP_METHOD(DOMNodeList, item)
113143
zval *id;
114144
zend_long index;
115145
int ret;
146+
bool cache_itemnode = false;
116147
dom_object *intern;
117148
xmlNodePtr itemnode = NULL;
118149

119150
dom_nnodemap_object *objmap;
120-
xmlNodePtr nodep, curnode;
151+
xmlNodePtr nodep;
121152
int count = 0;
122153

123154
id = ZEND_THIS;
@@ -147,28 +178,68 @@ PHP_METHOD(DOMNodeList, item)
147178
} else if (objmap->baseobj) {
148179
nodep = dom_object_get_node(objmap->baseobj);
149180
if (nodep) {
181+
/* For now we're only able to use cache for forward search.
182+
* TODO: in the future we could extend the logic of the node list such that backwards searches
183+
* are also possible. */
184+
bool restart = true;
185+
int relative_index = index;
186+
if (index >= objmap->cached_obj_index && objmap->cached_obj && !php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
187+
nodep = dom_object_get_node(objmap->cached_obj);
188+
/* The node cannot be NULL if the cache is valid. If it is NULL, then it means we
189+
* forgot an invalidation somewhere. Take the defensive programming approach and invalidate
190+
* it here if it's NULL (except in debug mode where we would want to catch this). */
191+
if (UNEXPECTED(nodep == NULL)) {
192+
#if ZEND_DEBUG
193+
ZEND_UNREACHABLE();
194+
#endif
195+
reset_objmap_cache(objmap);
196+
} else {
197+
restart = false;
198+
relative_index -= objmap->cached_obj_index;
199+
}
200+
}
150201
if (objmap->nodetype == XML_ATTRIBUTE_NODE || objmap->nodetype == XML_ELEMENT_NODE) {
151-
curnode = nodep->children;
152-
while (count < index && curnode != NULL) {
202+
if (restart) {
203+
nodep = nodep->children;
204+
}
205+
while (count < relative_index && nodep != NULL) {
153206
count++;
154-
curnode = curnode->next;
207+
nodep = nodep->next;
155208
}
156-
itemnode = curnode;
209+
itemnode = nodep;
157210
} else {
158-
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
159-
nodep = xmlDocGetRootElement((xmlDoc *) nodep);
160-
} else {
161-
nodep = nodep->children;
211+
if (restart) {
212+
if (nodep->type == XML_DOCUMENT_NODE || nodep->type == XML_HTML_DOCUMENT_NODE) {
213+
nodep = xmlDocGetRootElement((xmlDoc*) nodep);
214+
} else {
215+
nodep = nodep->children;
216+
}
162217
}
163-
itemnode = dom_get_elements_by_tag_name_ns_raw(nodep, (char *) objmap->ns, (char *) objmap->local, &count, index);
218+
itemnode = dom_get_elements_by_tag_name_ns_raw(nodep, (char *) objmap->ns, (char *) objmap->local, &count, relative_index);
164219
}
220+
cache_itemnode = true;
165221
}
166222
}
167223
}
168224
}
169225

170226
if (itemnode) {
171227
DOM_RET_OBJ(itemnode, &ret, objmap->baseobj);
228+
if (cache_itemnode) {
229+
/* Hold additional reference for the cache, must happen before releasing the cache
230+
* because we might be the last reference holder. */
231+
dom_object *cached_obj = Z_DOMOBJ_P(return_value);
232+
GC_ADDREF(&cached_obj->std);
233+
/* If the tag is stale, all cached data is useless. Otherwise only the cached object is useless. */
234+
if (php_dom_is_cache_tag_stale_from_node(&objmap->cache_tag, nodep)) {
235+
php_dom_mark_cache_tag_up_to_date_from_node(&objmap->cache_tag, nodep);
236+
reset_objmap_cache(objmap);
237+
} else {
238+
objmap_cache_release_cached_obj(objmap);
239+
}
240+
objmap->cached_obj_index = index;
241+
objmap->cached_obj = cached_obj;
242+
}
172243
return;
173244
}
174245
}

ext/dom/parentnode.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,9 @@ void dom_parent_node_append(dom_object *context, zval *nodes, int nodesc)
259259
{
260260
xmlNode *parentNode = dom_object_get_node(context);
261261
xmlNodePtr newchild, prevsib;
262+
263+
php_dom_invalidate_node_list_cache(parentNode->doc);
264+
262265
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
263266

264267
if (fragment == NULL) {
@@ -296,6 +299,8 @@ void dom_parent_node_prepend(dom_object *context, zval *nodes, int nodesc)
296299
return;
297300
}
298301

302+
php_dom_invalidate_node_list_cache(parentNode->doc);
303+
299304
xmlNodePtr newchild, nextsib;
300305
xmlNode *fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
301306

@@ -376,6 +381,8 @@ void dom_parent_node_after(dom_object *context, zval *nodes, int nodesc)
376381

377382
doc = prevsib->doc;
378383

384+
php_dom_invalidate_node_list_cache(doc);
385+
379386
/* Spec step 4: convert nodes into fragment */
380387
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
381388

@@ -425,6 +432,8 @@ void dom_parent_node_before(dom_object *context, zval *nodes, int nodesc)
425432

426433
doc = nextsib->doc;
427434

435+
php_dom_invalidate_node_list_cache(doc);
436+
428437
/* Spec step 4: convert nodes into fragment */
429438
fragment = dom_zvals_to_fragment(context->document, parentNode, nodes, nodesc);
430439

@@ -480,6 +489,8 @@ void dom_child_node_remove(dom_object *context)
480489
return;
481490
}
482491

492+
php_dom_invalidate_node_list_cache(context->document->ptr);
493+
483494
while (children) {
484495
if (children == child) {
485496
xmlUnlinkNode(child);

0 commit comments

Comments
 (0)