Skip to content

Commit 71dd59a

Browse files
committed
refactor: clean_html() to improve HTML sanitization
1 parent fa03d89 commit 71dd59a

File tree

2 files changed

+23
-17
lines changed

2 files changed

+23
-17
lines changed

course_discovery/apps/course_metadata/tests/test_utils.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,10 +880,20 @@ class UtilsTests(TestCase):
880880
'<p>Some text</p>\n<p>· Item 1</p>\n<ul>\n<li>Item 2</li>\n</ul>\n<p>Regular paragraph</p>\n<p>· Item 3</p>'
881881
)
882882
)
883+
@ddt.data(
884+
(
885+
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>', # pylint: disable=line-too-long
886+
'<p><em>The content of this course also forms part of the six-month online <a href="https://example.com">Example Link</a></em></p>' # pylint: disable=line-too-long
887+
),
888+
(
889+
'<div><p>online course.</p><p><strong>Module 1:</strong></p></div>',
890+
'<p>online course. <strong>Module 1:</strong></p>'
891+
)
892+
)
883893
@ddt.unpack
884894
def test_clean_html(self, content, expected):
885-
""" Verify the method removes unnecessary HTML attributes. """
886-
assert clean_html(content) == expected
895+
result = clean_html(content)
896+
assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}"
887897

888898
def test_skill_data_transformation(self):
889899
category_data = {

course_discovery/apps/course_metadata/utils.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -741,7 +741,6 @@ def handle_tag(self, tag, attrs, start):
741741
"""
742742
if not self.is_p_tag_with_dir:
743743
super().handle_tag(tag, attrs, start)
744-
745744
elif tag not in HTML_TAGS_ATTRIBUTE_WHITELIST and tag != 'span':
746745
if start:
747746
self.outtextf(f'<{tag}')
@@ -774,28 +773,25 @@ def clean_html(content):
774773
(indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved
775774
or added to maintain consistency with the original content.
776775
"""
776+
if not content:
777+
return ''
777778
LIST_TAGS = ['ul', 'ol']
778779
is_list_with_dir_attr_present = False
779-
780-
cleaned = content.replace('&nbsp;', '') # Keeping the removal of nbsps for historical consistency
781-
# Parse the HTML using BeautifulSoup
780+
cleaned = content.replace('&nbsp;', '')
782781
soup = BeautifulSoup(cleaned, 'lxml')
783-
784782
for tag in soup.find_all(LIST_TAGS, dir="rtl"):
785783
tag.attrs.pop('dir')
786784
is_list_with_dir_attr_present = True
787-
788-
cleaned = str(soup)
789-
# Need to clean empty <b> and <p> tags which are converted to <hr/> by html2text
790-
cleaned = cleaned.replace('<p><b></b></p>', '')
785+
cleaned = str(soup).replace('<p><b></b></p>', '')
791786
html_converter = HTML2TextWithLangSpans(bodywidth=None)
792787
html_converter.wrap_links = False
793-
cleaned = html_converter.handle(cleaned).strip()
794-
cleaned = markdown.markdown(cleaned)
795-
for tag in LIST_TAGS:
796-
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned
797-
798-
return cleaned
788+
markdown_text = html_converter.handle(cleaned).strip()
789+
cleaned = markdown.markdown(markdown_text)
790+
cleaned = re.sub(r'([^\s>])\s*(<a\b)', r'\1 \2', cleaned)
791+
if is_list_with_dir_attr_present:
792+
for tag in LIST_TAGS:
793+
cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">')
794+
return cleaned.strip()
799795

800796

801797
def get_file_from_drive_link(image_url):

0 commit comments

Comments
 (0)