Skip to content

Commit b5778cd

Browse files
committed
Fixes #24 - Fix internal links and don't allow root-relative links
1 parent 334656e commit b5778cd

File tree

3 files changed

+117
-15
lines changed

3 files changed

+117
-15
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@
99
- removed javascript dependencies from repository
1010
- add support for webm on systems without native support
1111
- fix small favicons
12-
- use pylibzim for creating ZIMs
12+
- use pylibzim for creating ZIMs
13+
- fix internal links

openedx2zim/scraper.py

Lines changed: 114 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,7 @@ def get_course_id(self, url, course_page_name, course_prefix, instance_url):
177177
]
178178
if "%3" in clean_id: # course_id seems already encode
179179
return clean_id
180-
else:
181-
return urllib.parse.quote_plus(clean_id)
180+
return urllib.parse.quote_plus(clean_id)
182181

183182
def prepare_mooc_data(self):
184183
self.instance_url = self.instance_connection.instance_config["instance_url"]
@@ -293,13 +292,10 @@ def annex_extra_page(self, tab_href, tab_org_path):
293292

294293
# its a content page
295294
if just_content is not None:
296-
html_content = self.dl_dependencies(
297-
content=str(just_content), output_path=output_path, path_from_html="",
298-
)
299295
self.annexed_pages.append(
300296
{
301297
"output_path": output_path,
302-
"content": html_content,
298+
"content": str(just_content),
303299
"title": soup_page.find("title").get_text(),
304300
}
305301
)
@@ -311,7 +307,7 @@ def annex_extra_page(self, tab_href, tab_org_path):
311307
self.book_lists.append(
312308
{
313309
"output_path": output_path,
314-
"book_list": self.get_book_list(book, output_path),
310+
"book_list": book,
315311
"dir_path": tab_org_path,
316312
}
317313
)
@@ -322,6 +318,7 @@ def annex_extra_page(self, tab_href, tab_org_path):
322318
"Oh it's seems we does not support one type of extra content (in top bar) :"
323319
+ tab_org_path
324320
)
321+
shutil.rmtree(output_path, ignore_errors=True)
325322
return None
326323

327324
def get_tab_path_and_name(self, tab_text, tab_href):
@@ -344,17 +341,21 @@ def get_tab_path_and_name(self, tab_text, tab_href):
344341
tab_path = "/index.html"
345342
elif "wiki" in tab_org_path and self.add_wiki:
346343
self.wiki = MoocWiki(self)
347-
self.wiki.annex_wiki()
348344
tab_path = f"{str(self.wiki.wiki_path)}/index.html"
349345
elif "forum" in tab_org_path and self.add_forum:
350346
self.forum = MoocForum(self)
351-
self.forum.annex_forum()
352347
tab_path = "forum/index.html"
353348
elif ("wiki" not in tab_org_path) and ("forum" not in tab_org_path):
354-
tab_path = self.annex_extra_page(tab_href, tab_org_path)
349+
# check if already in dict
350+
for _, val in self.course_tabs.items():
351+
if val == f"{tab_org_path}/index.html":
352+
tab_path = val
353+
break
354+
else:
355+
tab_path = self.annex_extra_page(tab_href, tab_org_path)
355356
return tab_name, tab_path
356357

357-
def annex(self):
358+
def get_course_tabs(self):
358359
logger.info("Getting course tabs ...")
359360
content = self.instance_connection.get_page(self.course_url)
360361
if not content:
@@ -375,6 +376,32 @@ def annex(self):
375376
if tab_name is not None and tab_path is not None:
376377
self.course_tabs[tab_name] = tab_path
377378

379+
def annex(self):
380+
self.get_course_tabs()
381+
logger.info("Downloading content for extra pages ...")
382+
for page in self.annexed_pages:
383+
page["content"] = self.dl_dependencies(
384+
content=page["content"],
385+
output_path=page["output_path"],
386+
path_from_html="",
387+
)
388+
389+
logger.info("Processing book lists")
390+
for item in self.book_lists:
391+
item["book_list"] = self.get_book_list(
392+
item["book_list"], item["output_path"]
393+
)
394+
395+
# annex wiki if available
396+
if hasattr(self, "wiki"):
397+
logger.info("Annexing wiki ...")
398+
self.wiki.annex_wiki()
399+
400+
# annex forum if available
401+
if hasattr(self, "forum"):
402+
logger.info("Annexing forum ...")
403+
self.forum.annex_forum()
404+
378405
def download_and_get_filename(
379406
self, src, output_path, with_ext=None, filter_ext=None
380407
):
@@ -483,6 +510,71 @@ def download_iframes_from_html(self, html_body, output_path, path_from_html):
483510
iframe.attrib["src"] = f"{path_from_html}/{filename}"
484511
return bool(iframes)
485512

513+
def handle_jump_to_paths(self, target_path):
514+
def check_descendants_and_return_path(xblock_extractor):
515+
if xblock_extractor.xblock_json["type"] in ["vertical", "course"]:
516+
return xblock_extractor.relative_path + "/index.html"
517+
if not xblock_extractor.descendants:
518+
return None
519+
return check_descendants_and_return_path(xblock_extractor.descendants[0])
520+
521+
for xblock_extractor in self.xblock_extractor_objects:
522+
if (
523+
urllib.parse.urlparse(xblock_extractor.xblock_json["lms_web_url"]).path
524+
== target_path
525+
):
526+
# we have a path match, we now check xblock type to redirect properly
527+
# Only vertical and course xblocks have HTMLs
528+
return check_descendants_and_return_path(xblock_extractor)
529+
530+
def relative_dots(self, output_path):
531+
relative_path = output_path.resolve().relative_to(self.build_dir.resolve())
532+
path_length = len(relative_path.parts)
533+
if path_length >= 5:
534+
# from a vertical, the root is 5 jumps deep
535+
return "../" * 5
536+
return "../" * path_length
537+
538+
def update_root_relative_path(self, anchor, fixed_path, output_path):
539+
if fixed_path:
540+
anchor.attrib["href"] = self.relative_dots(output_path) + fixed_path
541+
else:
542+
anchor.attrib["href"] = self.instance_url + anchor.attrib["href"]
543+
544+
def rewrite_internal_links(self, html_body, output_path):
545+
anchors = html_body.xpath("//a")
546+
path_prefix = f"{self.instance_connection.instance_config['course_prefix']}{urllib.parse.unquote_plus(self.course_id)}"
547+
path_fixed = False
548+
for anchor in anchors:
549+
if "href" in anchor.attrib:
550+
src = urllib.parse.urlparse(anchor.attrib["href"])
551+
if (
552+
src.netloc == self.instance_url or src.netloc == ""
553+
) and src.path.startswith(path_prefix):
554+
if "jump_to" in src.path:
555+
# handle jump to paths (to an xblock)
556+
fixed_path = self.handle_jump_to_paths(src.path)
557+
if not fixed_path:
558+
# the xblock may be one of the blocks that a vertical consists of
559+
# thus check if the parent has the valid path
560+
# we only need to check one layer deep as there's a single layer of xblocks beyond a vertical
561+
fixed_path = self.handle_jump_to_paths(
562+
str(pathlib.Path(src.path).parent)
563+
)
564+
self.update_root_relative_path(anchor, fixed_path, output_path)
565+
path_fixed = True
566+
else:
567+
# handle tab paths
568+
_, tab_path = self.get_tab_path_and_name(
569+
tab_text="", tab_href=src.path
570+
)
571+
self.update_root_relative_path(anchor, tab_path, output_path)
572+
path_fixed = True
573+
elif src.netloc == "" and src.path.startswith("/"):
574+
self.update_root_relative_path(anchor, None, output_path)
575+
path_fixed = True
576+
return path_fixed
577+
486578
def dl_dependencies(self, content, output_path, path_from_html):
487579
html_body = lxml.html.fromstring(str(content))
488580
imgs = self.download_images_from_html(html_body, output_path, path_from_html)
@@ -495,7 +587,16 @@ def dl_dependencies(self, content, output_path, path_from_html):
495587
iframes = self.download_iframes_from_html(
496588
html_body, output_path, path_from_html
497589
)
498-
if imgs or docs or css_files or js_files or sources or iframes:
590+
rewritten_links = self.rewrite_internal_links(html_body, output_path)
591+
if (
592+
imgs
593+
or docs
594+
or css_files
595+
or js_files
596+
or sources
597+
or iframes
598+
or rewritten_links
599+
):
499600
content = lxml.html.tostring(html_body, encoding="unicode")
500601
return content
501602

@@ -869,7 +970,7 @@ def run(self):
869970
language="eng",
870971
creator=zim_info["creator"],
871972
publisher=self.publisher,
872-
tags=self.tags + ["_category:openedx", "openedx"],
973+
tags=self.tags + ["_category:other", "openedx"],
873974
scraper=SCRAPER,
874975
without_fulltext_index=True if self.no_fulltext_index else False,
875976
)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Jinja2==2.11.2
66
mistune==2.0.0a4
77
requests>=2.24,<3.0
88
iso-639==0.4.5
9-
zimscraperlib@git+git://github.com/openzim/python_scraperlib.git@fix_link_rewriting#egg=zimscraperlib
9+
zimscraperlib>=1.2.1,<1.3
1010
kiwixstorage>=0.3,<1.0
1111
pif==0.8.2
1212
# youtube-dl should be updated as frequently as possible

0 commit comments

Comments
 (0)