openzim · rgaudin · Jul 20, 2020 · Jul 19, 2020 · Jul 20, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,8 @@
 * fixed URL rewriting when running from /
 * added support for link rewriting in <object> element
 * prevent from raising error if element doesn't have the attribute with url
+* use non-greedy match for CSS URL links (shortest string matching `url()` format)
+* fix namespace of target only if link doesn't have a netloc
 
 # 1.2.0
 

diff --git a/src/zimscraperlib/zim/rewriting.py b/src/zimscraperlib/zim/rewriting.py
@@ -89,6 +89,7 @@ def fix_target_for(
 
     # remove namespace from source ; and join it with target
     flat_target = pathlib.Path(*source.parts[1:]).parent.joinpath(target)
+
     if str(root.resolve()) == "/":
         flat_target = flat_target.relative_to(root)
     else:
@@ -143,7 +144,11 @@ def fix_links_in_html(url: str, content: str) -> str:
             html_link = node.attrs[key]
 
             # parse as a URL to extract querystring and fragment
-            _, _, target, query, fragment = urllib.parse.urlsplit(html_link)
+            _, netloc, target, query, fragment = urllib.parse.urlsplit(html_link)
+
+            # do nothing for links with netloc
+            if netloc:
+                continue
 
             # use source as target if there's none
             if not target:
@@ -199,7 +204,7 @@ def encapsulate(url):
 
     # split whole content on `url()` pattern to retrieve a list composed of
     # alternatively pre-pattern text and inside url() –– actual target text
-    parts = re.split(r"url\((.+)\)", content)
+    parts = re.split(r"url\((.+?)\)", content)
     for index, _ in enumerate(parts):
         if index % 2 == 0:  # skip even lines (0, 2, ..) as those are CSS code
             continue

diff --git a/tests/zim/conftest.py b/tests/zim/conftest.py
@@ -33,6 +33,7 @@ def html_str():
     <li><a href="download/toto.txt">text file</a></li>
     <li><a href="dest.html">HTML link</a></li>
     <li><a href="no-extension">no ext link</a></li>
+    <li><a href="http://www.example.com/index/sample.html">external link</a></li>
     <li><a media="">no href link</a></li>
 <object data="download/toto.jpg" width="300" height="200"></object>
 <script src="assets/js/bootstrap/bootsrap.css?v=20190101"></script>
@@ -120,6 +121,13 @@ def css_str_with_fonts():
     background-image: url("font/DroidSans.ttf#toto");
     background-image: url("font/DroidSans.ttf?yolo#toto");
 }
+@font-face {
+    font-family: 'Open Sans';
+    font-style: normal;
+    font-weight: 400;
+    src: local('Open Sans Regular'), local('OpenSans-Regular'),
+        url('font/DroidSans.ttf') format('truetype'), /* Test non greedy URL extraction */
+}
 """
 
 

diff --git a/tests/zim/test_rewriting.py b/tests/zim/test_rewriting.py
@@ -129,20 +129,20 @@ def test_fix_target_for(tmp_path, monkeypatch):
     )
     # make sure target to a file is fixed when not checking existence
     assert (
-        fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", False)
+        fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), False)
         == "../I/files/dl.pdf"
     )
     # make sure target is NOT fixed when target is not present and we requested it to
     assert (
-        fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", True)
+        fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), True)
         == "files/dl.pdf"
     )
     # make sure target to a file is fixed when checking existence and is present
     ff = Path(tmp_path / "files")
     ff.mkdir()
     ff.joinpath("dl.pdf").touch()
     assert (
-        fix_file_target_for(tmp_path, "home.html", "files/dl.pdf", True)
+        fix_file_target_for(tmp_path, Path("home.html"), Path("files/dl.pdf"), True)
         == "../I/files/dl.pdf"
     )
     # special behavior when CWD is /
@@ -256,7 +256,7 @@ def test_fix_urls_in_css_file(tmp_path, css_str, font, css_str_with_fonts):
     with open(fpath, "r") as fh:
         content = fh.read()
         assert content != css_str
-        assert content.count("data:") == 12
+        assert content.count("data:") == 13
 
 
 @pytest.mark.parametrize(