Skip to content

Commit 6ad4ad2

Browse files
committed
refactor: clean_html() to improve HTML sanitization
1 parent 6812335 commit 6ad4ad2

File tree

2 files changed: 21 additions and 7 deletions

course_discovery/apps/course_metadata/tests/test_utils.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -880,10 +880,21 @@ class UtilsTests(TestCase):
880880
'<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
881881
)
882882
)
883+
@ddt.data(
884+
(
885+
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>', # pylint: disable=line-too-long
886+
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>' # pylint: disable=line-too-long
887+
),
888+
(
889+
'<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
890+
'<p>online course. <strong>Module 1:</strong></p>'
891+
)
892+
)
883893
@ddt.unpack
884894
def test_clean_html(self, content, expected):
885895
""" Verify the method removes unnecessary HTML attributes. """
886-
assert clean_html(content) == expected
896+
result = clean_html(content)
897+
assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"
887898

888899
def test_skill_data_transformation(self):
889900
category_data = {

course_discovery/apps/course_metadata/utils.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,8 @@ def clean_html(content):
774774
(indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
775775
or added to maintain consistency with the original content.
776776
"""
777+
if not content:
778+
return ''
777779
LIST_TAGS = ['ul', 'ol']
778780
is_list_with_dir_attr_present = False
779781

@@ -790,12 +792,13 @@ def clean_html(content):
790792
cleaned = cleaned.replace('<p><b></b></p>', '')
791793
html_converter = HTML2TextWithLangSpans(bodywidth=None)
792794
html_converter.wrap_links = False
793-
cleaned = html_converter.handle(cleaned).strip()
794-
cleaned = markdown.markdown(cleaned)
795-
for tag in LIST_TAGS:
796-
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
797-
798-
return cleaned
795+
markdown_text = html_converter.handle(cleaned).strip()
796+
cleaned = markdown.markdown(markdown_text)
797+
cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
798+
if is_list_with_dir_attr_present:
799+
for tag in LIST_TAGS:
800+
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
801+
return cleaned.strip()
799802

800803

801804
def get_file_from_drive_link(image_url):

Comments (0)