33import requests
44from bs4 import BeautifulSoup
55from urllib .parse import urljoin
6+ from urllib .parse import urlparse
67
78
89def download (url_ , path_ ):
@@ -18,7 +19,7 @@ def download(url_, path_):
1819 else :
1920 with open (path_ , 'wb' ) as file :
2021 file .write (response .content )
21- print (f"\033 [32m已下载 \033 [0m { path_ } " )
22+ print (f"\033 [32m已下载 \033 [0m文件链接 { url_ } , 文件路径 { path_ } " )
2223 else :
2324 print (f"\033 [91m无法下载文件链接:\033 [0m{ url_ } , \033 [91m状态码:\033 [0m{ response .status_code } " )
2425
@@ -30,6 +31,7 @@ class OSCourseware:
3031 WITHOUT_DOWNLOAD = [
3132 "https://jyywiki.cn/pages/OS/2022/Labs/lab-makefile.png" , # 404
3233 "https://jyywiki.cn/OS/2021/slides/Slides_Author" , # 404
34+ "https://jyywiki.cn/OS/2022/slides/Slides_Author" , # 404
3335 "https://jyywiki.cn/index.html" # unnecessary
3436 ]
3537 KEY_YEAR = {'A' : "2021" , 'B' : "2022" , 'C' : "2023" , 'D' : "ALL" , '' : "2023" }
@@ -60,7 +62,7 @@ def build_courseware_url_path(year_):
6062 elif self .year_input != "Invalid" :
6163 self .sources_url_path_pairs .update (build_courseware_url_path (self .year_input ))
6264 if self .year_input != "2023" :
63- self .WITHOUT_DOWNLOAD .append (f'{ self .BASE_URL } /OS/2023/index.html' )
65+ self .WITHOUT_DOWNLOAD .append (f'{ self .BASE_URL } /OS/2023/index.html' ) # 避免在其他文件中误下载2023/index.html
6466 else :
6567 print ("\033 [91m输入非法,程序退出" )
6668 sys .exit ()
@@ -85,22 +87,22 @@ def file_analyse(self, filepath):
8587 return
8688
8789 # 提取文件中的相对链接
88- _links_tags = soup .find_all (href = True ) + soup .find_all (src = True )
90+ _links_tags = soup .find_all (href = True ) + soup .find_all (src = True ) + soup . find_all ( data = True )
8991 _links_attr = []
9092 for link in _links_tags :
9193 _links_attr .extend ([link .get ("href" ), link .get ("src" ), link .get ("data" )])
9294 _links_attr = list (set (_links_attr )) # 去除重复的元素
9395
94- # 补全完整的文件地址和链接
96+ # 以filepath指定的文件为参照补全文件中的网址以及在本地存储的地址
9597 for link in _links_attr :
9698 if link is not None and not link .startswith (("http" , "data" )): # data是ipynb.html文件资源
97- # 以filepath指定的文件为参照补全文件中的网址以及在本地存储的地址
98- path = os .path .normpath (os .path .join (os .path .dirname (filepath ), link .replace ("/" , "\\ " )))
99+ link = urlparse ( link ). path # 清除锚点
100+ absolute_path = os .path .normpath (os .path .join (os .path .dirname (filepath ), link .replace ("/" , "\\ " )))
99101 try :
100- relative_path = path .split (self .BASE_DIR + os .sep + self .COURSEWARE_DIR )[1 ]
102+ relative_path = absolute_path .split (self .BASE_DIR + os .sep + self .COURSEWARE_DIR )[1 ]
101103 url = urljoin (self .BASE_URL , relative_path .replace ("\\ " , "/" ))
102104 if url not in self .WITHOUT_DOWNLOAD :
103- self .sources_url_path_pairs .update ({url : path })
105+ self .sources_url_path_pairs .update ({url : absolute_path })
104106 self .WITHOUT_DOWNLOAD .append (url )
105107 except IndexError :
106108 continue
0 commit comments