Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* fixed URL rewriting when running from /
* added support for link rewriting in <object> element
* prevent raising an error if an element doesn't have the attribute holding the URL
* use non-greedy match for CSS URL links (shortest string matching the `url()` format)
* fix namespace of target only if link doesn't have a netloc

# 1.2.0

Expand Down
9 changes: 7 additions & 2 deletions src/zimscraperlib/zim/rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def fix_target_for(

# remove namespace from source ; and join it with target
flat_target = pathlib.Path(*source.parts[1:]).parent.joinpath(target)

if str(root.resolve()) == "/":
flat_target = flat_target.relative_to(root)
else:
Expand Down Expand Up @@ -143,7 +144,11 @@ def fix_links_in_html(url: str, content: str) -> str:
html_link = node.attrs[key]

# parse as a URL to extract querystring and fragment
_, _, target, query, fragment = urllib.parse.urlsplit(html_link)
_, netloc, target, query, fragment = urllib.parse.urlsplit(html_link)

# do nothing for links with netloc
if netloc:
continue

# use source as target if there's none
if not target:
Expand Down Expand Up @@ -199,7 +204,7 @@ def encapsulate(url):

# split whole content on `url()` pattern to retrieve a list composed of
# alternatively pre-pattern text and inside url() –– actual target text
parts = re.split(r"url\((.+)\)", content)
parts = re.split(r"url\((.+?)\)", content)
for index, _ in enumerate(parts):
if index % 2 == 0: # skip even lines (0, 2, ..) as those are CSS code
continue
Expand Down
8 changes: 8 additions & 0 deletions tests/zim/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def html_str():
<li><a href="download/toto.txt">text file</a></li>
<li><a href="dest.html">HTML link</a></li>
<li><a href="no-extension">no ext link</a></li>
<li><a href="http://www.example.com/index/sample.html">external link</a></li>
<li><a media="">no href link</a></li>
<object data="download/toto.jpg" width="300" height="200"></object>
<script src="assets/js/bootstrap/bootsrap.css?v=20190101"></script>
Expand Down Expand Up @@ -120,6 +121,13 @@ def css_str_with_fonts():
background-image: url("font/DroidSans.ttf#toto");
background-image: url("font/DroidSans.ttf?yolo#toto");
}
@font-face {
font-family: 'Open Sans';
font-style: normal;
font-weight: 400;
src: local('Open Sans Regular'), local('OpenSans-Regular'),
url('font/DroidSans.ttf') format('truetype'), /* Test non greedy URL extraction */
}
"""


Expand Down
8 changes: 4 additions & 4 deletions tests/zim/test_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,20 +129,20 @@ def test_fix_target_for(tmp_path, monkeypatch):
)
# make sure target to a file is fixed when not checking existence
assert (
fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", False)
fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), False)
== "../I/files/dl.pdf"
)
# make sure target is NOT fixed when target is not present and we requested it to
assert (
fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", True)
fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), True)
== "files/dl.pdf"
)
# make sure target to a file is fixed when checking existence and is present
ff = Path(tmp_path / "files")
ff.mkdir()
ff.joinpath("dl.pdf").touch()
assert (
fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", True)
fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), True)
== "../I/files/dl.pdf"
)
# special behavior when CWD is /
Expand Down Expand Up @@ -256,7 +256,7 @@ def test_fix_urls_in_css_file(tmp_path, css_str, font, css_str_with_fonts):
with open(fpath, "r") as fh:
content = fh.read()
assert content != css_str
assert content.count("data:") == 12
assert content.count("data:") == 13


@pytest.mark.parametrize(
Expand Down