@@ -177,8 +177,7 @@ def get_course_id(self, url, course_page_name, course_prefix, instance_url):
177177        ]
178178        if  "%3"  in  clean_id :  # course_id seems already encode 
179179            return  clean_id 
180-         else :
181-             return  urllib .parse .quote_plus (clean_id )
180+         return  urllib .parse .quote_plus (clean_id )
182181
183182    def  prepare_mooc_data (self ):
184183        self .instance_url  =  self .instance_connection .instance_config ["instance_url" ]
@@ -293,13 +292,10 @@ def annex_extra_page(self, tab_href, tab_org_path):
293292
294293        # its a content page 
295294        if  just_content  is  not None :
296-             html_content  =  self .dl_dependencies (
297-                 content = str (just_content ), output_path = output_path , path_from_html = "" ,
298-             )
299295            self .annexed_pages .append (
300296                {
301297                    "output_path" : output_path ,
302-                     "content" : html_content ,
298+                     "content" : str ( just_content ) ,
303299                    "title" : soup_page .find ("title" ).get_text (),
304300                }
305301            )
@@ -311,7 +307,7 @@ def annex_extra_page(self, tab_href, tab_org_path):
311307            self .book_lists .append (
312308                {
313309                    "output_path" : output_path ,
314-                     "book_list" : self . get_book_list ( book ,  output_path ) ,
310+                     "book_list" : book ,
315311                    "dir_path" : tab_org_path ,
316312                }
317313            )
@@ -322,6 +318,7 @@ def annex_extra_page(self, tab_href, tab_org_path):
322318            "Oh it's seems we does not support one type of extra content (in top bar) :" 
323319            +  tab_org_path 
324320        )
321+         shutil .rmtree (output_path , ignore_errors = True )
325322        return  None 
326323
327324    def  get_tab_path_and_name (self , tab_text , tab_href ):
@@ -344,17 +341,21 @@ def get_tab_path_and_name(self, tab_text, tab_href):
344341            tab_path  =  "/index.html" 
345342        elif  "wiki"  in  tab_org_path  and  self .add_wiki :
346343            self .wiki  =  MoocWiki (self )
347-             self .wiki .annex_wiki ()
348344            tab_path  =  f"{ str (self .wiki .wiki_path )}  
349345        elif  "forum"  in  tab_org_path  and  self .add_forum :
350346            self .forum  =  MoocForum (self )
351-             self .forum .annex_forum ()
352347            tab_path  =  "forum/index.html" 
353348        elif  ("wiki"  not  in tab_org_path ) and  ("forum"  not  in tab_org_path ):
354-             tab_path  =  self .annex_extra_page (tab_href , tab_org_path )
349+             # check if already in dict 
350+             for  _ , val  in  self .course_tabs .items ():
351+                 if  val  ==  f"{ tab_org_path }  :
352+                     tab_path  =  val 
353+                     break 
354+             else :
355+                 tab_path  =  self .annex_extra_page (tab_href , tab_org_path )
355356        return  tab_name , tab_path 
356357
357-     def  annex (self ):
358+     def  get_course_tabs (self ):
358359        logger .info ("Getting course tabs ..." )
359360        content  =  self .instance_connection .get_page (self .course_url )
360361        if  not  content :
@@ -375,6 +376,32 @@ def annex(self):
375376                if  tab_name  is  not None  and  tab_path  is  not None :
376377                    self .course_tabs [tab_name ] =  tab_path 
377378
def annex(self):
    """Collect the course tabs, then fetch everything they queued up.

    Runs in this order:
      1. ``get_course_tabs`` is called first.
      2. every entry of ``self.annexed_pages`` has its ``"content"`` replaced
         by the result of ``dl_dependencies`` for that page.
      3. every entry of ``self.book_lists`` has its ``"book_list"`` replaced
         by the result of ``get_book_list``.
      4. the wiki, then the forum, are annexed when the corresponding
         attribute exists on the instance.
    """
    self.get_course_tabs()

    logger.info("Downloading content for extra pages ...")
    for extra_page in self.annexed_pages:
        extra_page["content"] = self.dl_dependencies(
            content=extra_page["content"],
            output_path=extra_page["output_path"],
            path_from_html="",
        )

    logger.info("Processing book lists")
    for entry in self.book_lists:
        entry["book_list"] = self.get_book_list(
            entry["book_list"], entry["output_path"]
        )

    # Wiki and forum are optional: each is annexed only when the attribute
    # is present on the instance, wiki first.
    optional_parts = (
        ("wiki", "Annexing wiki ...", "annex_wiki"),
        ("forum", "Annexing forum ...", "annex_forum"),
    )
    for attr_name, message, method_name in optional_parts:
        if hasattr(self, attr_name):
            logger.info(message)
            getattr(getattr(self, attr_name), method_name)()
404+ 
378405    def  download_and_get_filename (
379406        self , src , output_path , with_ext = None , filter_ext = None 
380407    ):
@@ -483,6 +510,71 @@ def download_iframes_from_html(self, html_body, output_path, path_from_html):
483510                        iframe .attrib ["src" ] =  f"{ path_from_html } { filename }  
484511        return  bool (iframes )
485512
def handle_jump_to_paths(self, target_path):
    """Return the local HTML path for the xblock served at ``target_path``.

    The first entry of ``self.xblock_extractor_objects`` whose
    ``lms_web_url`` path equals ``target_path`` is selected. Only
    "vertical" and "course" xblocks have an HTML page, so from the selected
    xblock we follow the first descendant at each level until one of those
    types is reached.

    Returns:
        ``<relative_path>/index.html`` of the xblock found, or ``None`` when
        no xblock matches or the first-descendant chain ends before reaching
        a "vertical"/"course" xblock.
    """
    html_types = ["vertical", "course"]
    for candidate in self.xblock_extractor_objects:
        web_path = urllib.parse.urlparse(candidate.xblock_json["lms_web_url"]).path
        if web_path != target_path:
            continue
        # Path match: walk down through first descendants until we hit an
        # xblock type that has its own HTML page.
        node = candidate
        while node.xblock_json["type"] not in html_types:
            if not node.descendants:
                return None
            node = node.descendants[0]
        return node.relative_path + "/index.html"
    return None
529+ 
def relative_dots(self, output_path):
    """Return the ``../`` prefix leading from ``output_path`` up to the root.

    The depth is the number of path components of ``output_path`` relative
    to ``self.build_dir`` (both resolved first). The prefix is capped at
    five levels: from a vertical, the root is 5 jumps deep.

    Raises:
        ValueError: from ``relative_to`` when ``output_path`` is not located
            under ``self.build_dir``.
    """
    build_root = self.build_dir.resolve()
    depth = len(output_path.resolve().relative_to(build_root).parts)
    return "../" * min(depth, 5)
537+ 
def update_root_relative_path(self, anchor, fixed_path, output_path):
    """Rewrite ``anchor``'s ``href`` in place.

    Args:
        anchor: element whose ``attrib["href"]`` is overwritten.
        fixed_path: local path for the link target; when truthy the href
            becomes that path prefixed with ``relative_dots(output_path)``.
            When falsy, the existing href is prefixed with
            ``self.instance_url`` instead.
        output_path: location handed to ``relative_dots``; only used when
            ``fixed_path`` is truthy.
    """
    if not fixed_path:
        new_href = self.instance_url + anchor.attrib["href"]
    else:
        new_href = self.relative_dots(output_path) + fixed_path
    anchor.attrib["href"] = new_href
543+ 
def rewrite_internal_links(self, html_body, output_path):
    """Rewrite the ``href`` of every in-instance ``<a>`` found in ``html_body``.

    Anchors are handled as follows:
      * link into this course (same host or no host, path under the course
        prefix): ``jump_to`` links are resolved through
        ``handle_jump_to_paths`` (trying the parent path as a fallback),
        other links through ``get_tab_path_and_name``. The result is handed
        to ``update_root_relative_path``.
      * any other root-relative link (``/something``): handed to
        ``update_root_relative_path`` with no local path.
      * everything else (other hosts, relative links, anchors without
        ``href``, hrefs that cannot be parsed): left untouched.

    Args:
        html_body: parsed HTML tree exposing ``xpath``.
        output_path: location of the page being rewritten; forwarded to
            ``update_root_relative_path``.

    Returns:
        ``True`` when at least one anchor was rewritten, ``False`` otherwise.
    """
    # NOTE(review): prefix taken to be the configured course prefix directly
    # followed by the decoded course id - confirm no suffix is expected.
    path_prefix = (
        f"{self.instance_connection.instance_config['course_prefix']}"
        f"{urllib.parse.unquote_plus(self.course_id)}"
    )
    # instance_url is concatenated with root-relative hrefs elsewhere, so it
    # evidently carries a scheme; urlparse().netloc never does. Accept the
    # bare host too, otherwise absolute links to the instance never match.
    instance_hosts = {
        "",
        self.instance_url,
        urllib.parse.urlparse(self.instance_url).netloc,
    }

    path_fixed = False
    for anchor in html_body.xpath("//a"):
        href = anchor.attrib.get("href")
        if href is None:
            continue
        try:
            src = urllib.parse.urlparse(href)
        except ValueError:
            # malformed href (e.g. broken IPv6 literal): nothing to rewrite
            continue

        if src.netloc in instance_hosts and src.path.startswith(path_prefix):
            if "jump_to" in src.path:
                # handle jump to paths (to an xblock)
                local_path = self.handle_jump_to_paths(src.path)
                if not local_path:
                    # The xblock may be one of those a vertical consists of,
                    # so check whether the parent has a valid path. One layer
                    # is enough: there is a single layer of xblocks beyond a
                    # vertical. URL paths are POSIX-style on every platform.
                    local_path = self.handle_jump_to_paths(
                        str(pathlib.PurePosixPath(src.path).parent)
                    )
            else:
                # handle tab paths
                _, local_path = self.get_tab_path_and_name(
                    tab_text="", tab_href=src.path
                )
            if not local_path and src.netloc:
                # Unresolved but already absolute: it points at the online
                # instance as is, and prefixing instance_url would break it.
                continue
            self.update_root_relative_path(anchor, local_path, output_path)
            path_fixed = True
        elif src.netloc == "" and src.path.startswith("/"):
            self.update_root_relative_path(anchor, None, output_path)
            path_fixed = True
    return path_fixed
577+ 
486578    def  dl_dependencies (self , content , output_path , path_from_html ):
487579        html_body  =  lxml .html .fromstring (str (content ))
488580        imgs  =  self .download_images_from_html (html_body , output_path , path_from_html )
@@ -495,7 +587,16 @@ def dl_dependencies(self, content, output_path, path_from_html):
495587        iframes  =  self .download_iframes_from_html (
496588            html_body , output_path , path_from_html 
497589        )
498-         if  imgs  or  docs  or  css_files  or  js_files  or  sources  or  iframes :
590+         rewritten_links  =  self .rewrite_internal_links (html_body , output_path )
591+         if  (
592+             imgs 
593+             or  docs 
594+             or  css_files 
595+             or  js_files 
596+             or  sources 
597+             or  iframes 
598+             or  rewritten_links 
599+         ):
499600            content  =  lxml .html .tostring (html_body , encoding = "unicode" )
500601        return  content 
501602
@@ -869,7 +970,7 @@ def run(self):
869970                language = "eng" ,
870971                creator = zim_info ["creator" ],
871972                publisher = self .publisher ,
872-                 tags = self .tags  +  ["_category:openedx " , "openedx" ],
973+                 tags = self .tags  +  ["_category:other " , "openedx" ],
873974                scraper = SCRAPER ,
874975                without_fulltext_index = True  if  self .no_fulltext_index  else  False ,
875976            )
0 commit comments