99class HtmlContentFilter
1010{
1111 /**
12- * Remove all the script elements from the given HTML document.
12+ * Remove all active content from the given HTML document.
13+ * This aims to cover anything which can dynamically deal with, or send, data
14+ * like any JavaScript actions or form content.
1315 */
14- public static function removeScriptsFromDocument (HtmlDocument $ doc )
16+ public static function removeActiveContentFromDocument (HtmlDocument $ doc ): void
1517 {
1618 // Remove standard script tags
1719 $ scriptElems = $ doc ->queryXPath ('//script ' );
@@ -21,7 +23,7 @@ public static function removeScriptsFromDocument(HtmlDocument $doc)
2123 $ badLinks = $ doc ->queryXPath ('//*[ ' . static ::xpathContains ('@href ' , 'javascript: ' ) . '] ' );
2224 static ::removeNodes ($ badLinks );
2325
24- // Remove forms with calls to JavaScript URI
26+ // Remove elements with form-like attributes with calls to JavaScript URI
2527 $ badForms = $ doc ->queryXPath ('//*[ ' . static ::xpathContains ('@action ' , 'javascript: ' ) . '] | //*[ ' . static ::xpathContains ('@formaction ' , 'javascript: ' ) . '] ' );
2628 static ::removeNodes ($ badForms );
2729
@@ -47,25 +49,71 @@ public static function removeScriptsFromDocument(HtmlDocument $doc)
4749 // Remove 'on*' attributes
4850 $ onAttributes = $ doc ->queryXPath ('//@*[starts-with(name(), \'on \')] ' );
4951 static ::removeAttributes ($ onAttributes );
52+
53+ // Remove form elements
54+ $ formElements = ['form ' , 'fieldset ' , 'button ' , 'textarea ' , 'select ' ];
55+ foreach ($ formElements as $ formElement ) {
56+ $ matchingFormElements = $ doc ->queryXPath ('// ' . $ formElement );
57+ static ::removeNodes ($ matchingFormElements );
58+ }
59+
60+ // Remove non-checkbox inputs
61+ $ inputsToRemove = $ doc ->queryXPath ('//input ' );
62+ /** @var DOMElement $input */
63+ foreach ($ inputsToRemove as $ input ) {
64+ $ type = strtolower ($ input ->getAttribute ('type ' ));
65+ if ($ type !== 'checkbox ' ) {
66+ $ input ->parentNode ->removeChild ($ input );
67+ }
68+ }
69+
70+ // Remove form attributes
71+ $ formAttrs = ['form ' , 'formaction ' , 'formmethod ' , 'formtarget ' ];
72+ foreach ($ formAttrs as $ formAttr ) {
73+ $ matchingFormAttrs = $ doc ->queryXPath ('//@ ' . $ formAttr );
74+ static ::removeAttributes ($ matchingFormAttrs );
75+ }
5076 }
5177
/**
 * Strip active content out of the given HTML string and return the result.
 * "Active content" here means anything which can dynamically deal with,
 * or send, data - such as JavaScript actions or form content.
 *
 * @param string $html The HTML markup to filter.
 * @return string The filtered markup, as produced by the document's
 *                getBodyInnerHtml(); the input itself when it is empty.
 */
public static function removeActiveContentFromHtmlString(string $html): string
{
    // empty() is true for both '' and '0', so either of those inputs is
    // handed back untouched without building a document.
    if (!empty($html)) {
        $document = new HtmlDocument($html);
        static::removeActiveContentFromDocument($document);

        return $document->getBodyInnerHtml();
    }

    return $html;
}
6694
/**
 * Backwards-compatible alias that keeps the previous method name working,
 * so that callers are not broken during a patch release.
 * It simply forwards to removeActiveContentFromDocument() and is intended
 * to be removed in a future feature release.
 *
 * @param HtmlDocument $doc The document to filter; passed straight through.
 * @deprecated Use removeActiveContentFromDocument instead.
 */
public static function removeScriptsFromDocument(HtmlDocument $doc): void
{
    // static:: (not self::) so subclasses overriding the new method are honoured.
    static::removeActiveContentFromDocument($doc);
}
104+
/**
 * Backwards-compatible alias that keeps the previous method name working,
 * so that callers are not broken during a patch release.
 * It simply forwards to removeActiveContentFromHtmlString() and is intended
 * to be removed in a future feature release.
 *
 * @param string $html The HTML markup to filter; passed straight through.
 * @return string Whatever removeActiveContentFromHtmlString() returns.
 * @deprecated Use removeActiveContentFromHtmlString instead.
 */
public static function removeScriptsFromHtmlString(string $html): string
{
    // static:: (not self::) so subclasses overriding the new method are honoured.
    $filteredHtml = static::removeActiveContentFromHtmlString($html);

    return $filteredHtml;
}
114+
115+ /**
116+ * Create an x-path 'contains' statement with a translation automatically built within
70118 * to effectively search in a case-insensitive manner.
70118 */
71119 protected static function xpathContains (string $ property , string $ value ): string
0 commit comments