Skip to content

Commit a91b174

Browse files
Merge pull request #1 from AngelikiBoura/still-bitting
For issue #162: add a different regex pattern to search for meta tags
2 parents d9763db + 17e4780 commit a91b174

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

tests/test_html.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,3 +669,14 @@ def test_inside_script(self):
669669
get_meta_refresh(body, baseurl, ignore_tags=()),
670670
(0.0, "http://example.org/foobar_required"),
671671
)
672+
673+
def test_redirections_in_different_ordering__in_meta_tag(self):
674+
baseurl = 'http://localhost:8000'
675+
url1 = '<html><head><meta http-equiv="refresh" content="0;url=dummy.html"></head></html>'
676+
url2 = '<html><head><meta content="0;url=dummy.html" http-equiv="refresh"></head></html>'
677+
self.assertEqual(
678+
get_meta_refresh(url1, baseurl), (0.0, 'http://localhost:8000/dummy.html')
679+
)
680+
self.assertEqual(
681+
get_meta_refresh(url2, baseurl), (0.0, 'http://localhost:8000/dummy.html')
682+
)

w3lib/html.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121
r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)',
2222
re.DOTALL | re.IGNORECASE,
2323
)
24+
_meta_refresh_re2 = re.compile(
25+
r'<meta\s[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)\shttp-equiv="refresh"',
26+
re.DOTALL | re.IGNORECASE,
27+
)
28+
2429
_cdata_re = re.compile(
2530
r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))", re.DOTALL
2631
)
@@ -338,7 +343,7 @@ def get_meta_refresh(
338343
raise
339344
utext = remove_tags_with_content(utext, ignore_tags)
340345
utext = remove_comments(replace_entities(utext))
341-
m = _meta_refresh_re.search(utext)
346+
m = _meta_refresh_re.search(utext) or _meta_refresh_re2.search(utext)
342347
if m:
343348
interval = float(m.group("int"))
344349
url = safe_url_string(m.group("url").strip(" \"'"), encoding)

0 commit comments

Comments
 (0)