refactor: improve HTML sanitization in clean_html()

nbalne · nbalne · commit 6ad4ad2db9ed · 2025-09-30T07:16:29.000Z
diff --git a/course_discovery/apps/course_metadata/tests/test_utils.py b/course_discovery/apps/course_metadata/tests/test_utils.py
@@ -880,10 +880,21 @@ class UtilsTests(TestCase):
             '<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
         )
     )
+    @ddt.data(
+        (
+            '<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>',  # pylint: disable=line-too-long
+            '<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>'  # pylint: disable=line-too-long
+        ),
+        (
+            '<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
+            '<p>online course. <strong>Module 1:</strong></p>'
+        )
+    )
     @ddt.unpack
     def test_clean_html(self, content, expected):
         """ Verify the method removes unnecessary HTML attributes. """
-        assert clean_html(content) == expected
+        result = clean_html(content)
+        assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"
 
     def test_skill_data_transformation(self):
         category_data = {
diff --git a/course_discovery/apps/course_metadata/utils.py b/course_discovery/apps/course_metadata/utils.py
@@ -774,6 +774,8 @@ def clean_html(content):
     (indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
     or added to maintain consistency with the original content.
     """
+    if not content:
+        return ''
     LIST_TAGS = ['ul', 'ol']
     is_list_with_dir_attr_present = False
 
@@ -790,12 +792,13 @@ def clean_html(content):
     cleaned = cleaned.replace('<p><b></b></p>', '')
     html_converter = HTML2TextWithLangSpans(bodywidth=None)
     html_converter.wrap_links = False
-    cleaned = html_converter.handle(cleaned).strip()
-    cleaned = markdown.markdown(cleaned)
-    for tag in LIST_TAGS:
-        cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
-
-    return cleaned
+    markdown_text = html_converter.handle(cleaned).strip()
+    cleaned = markdown.markdown(markdown_text)
+    cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
+    if is_list_with_dir_attr_present:
+        for tag in LIST_TAGS:
+            cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
+    return cleaned.strip()
 
 
 def get_file_from_drive_link(image_url):