 from bs4 import BeautifulSoup
 from urllib.parse import urlparse

+
 BASE_URL = "https://jyywiki.cn"
 BASE_DIR = os.path.join(os.getcwd(), "Courseware")
 WITHOUT_DOWNLOAD = [
     "https://jyywiki.cn/ISER2023/1-intro/",  # unnecessary
     "https://jyywiki.cn/index.html",  # unnecessary
 ]
-KEY_YEAR = {'A': "2021", 'B': "2022", 'C': "2023", 'D': "ALL", '': "2023"}
-
+fail_download = []
 sources_url_path_pairs = {}


+# Combine a file's local path with a relative link found in that file to get the link target's local storage path
 def get_full_path(path, link):
-    pathdir = os.path.dirname(path).replace("/", os.sep)
-    filepath = ""
+    pathdir = os.path.dirname(path).replace("/", os.sep)  # links inside the files always use `/`, so switch to os.sep
     if link.startswith("/"):
-        if link.find("../") != -1:
+        if link.find("../") != -1:  # skip links of the form `/path/to/file/../path/to/file`
             return None
-        filepath = os.path.join(BASE_DIR, link[1:].replace("/", os.sep))
+        return os.path.join(BASE_DIR, link[1:].replace("/", os.sep))  # drop the leading `/`
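+        # e.g. link "/OS/2023/index.html" maps to os.path.join(BASE_DIR, "OS", "2023", "index.html") (illustrative example)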
     elif link.startswith("../"):
         urlsplit = link.split("/")
         pathsplit = pathdir.split(os.sep)
-        count = sum([-1 for item in urlsplit if item == ".."])
-        filepath = os.path.join(os.sep.join(pathsplit[:count]), os.sep.join(urlsplit[-count:]))
+        count = sum([-1 for item in urlsplit if item == ".."])  # handle links with several `../` segments
+        return os.path.join(os.sep.join(pathsplit[:count]), os.sep.join(urlsplit[-count:]))
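+        # e.g. pathdir ".../OS/2023/build" with link "../slides/1.html" gives ".../OS/2023/slides/1.html" (illustrative example)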
     else:
-        filepath = os.path.join(pathdir, link.replace("/", os.sep))
-
-    return filepath
+        return os.path.join(pathdir, link.replace("/", os.sep))


 def get_full_url(path):
-    return path.replace(BASE_DIR, BASE_URL).replace("\\", "/")
+    return path.replace(BASE_DIR, BASE_URL).replace("\\", "/")  # URLs always use `/`, so replace every `\` in the path
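+    # e.g. os.path.join(BASE_DIR, "OS", "2023", "index.html") maps back to "https://jyywiki.cn/OS/2023/index.html" (illustrative example)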


-def download(url_, path_):
-    if not os.path.exists(os.path.dirname(path_)):
-        os.makedirs(os.path.dirname(path_))
-    if os.path.exists(path_):
+def download(url, path):
+    if not os.path.exists(os.path.dirname(path)):
+        os.makedirs(os.path.dirname(path))
+    if os.path.exists(path):
         return True
-    response = requests.get(url_)
+    response = requests.get(url)
     if response.status_code == 200:
         if response.headers['Content-Type'].startswith('text'):
-            with open(path_, 'w', encoding="utf-8") as file:
+            with open(path, 'w', encoding="utf-8") as file:
                 file.write(response.text)
         else:
-            with open(path_, 'wb') as file:
+            with open(path, 'wb') as file:  # non-text (binary) file
                 file.write(response.content)
-        print(f"\033[32mDownloaded\033[0m file URL {url_}, local path {path_}")
+        print(f"\033[32mDownloaded\033[0m file URL {url}, local path {path}")
         return True
     else:
-        print(f"\033[91mFailed to download file URL, status code: \033[0m{response.status_code} \033[0m{url_}\033[91m")
+        fail_download.append([response.status_code, url, path])
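+        # failures are collected here and reported in one batch after the download loop in file_download()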
         return False


 def file_download_option():
     global sources_url_path_pairs
     global WITHOUT_DOWNLOAD
+    KEY_YEAR = {'A': "2021", 'B': "2022", 'C': "2023", 'D': "ALL", '': "2023"}
+
     def build_courseware_url_path(year):
         url_ = f'{BASE_URL}/OS/{year}/index.html'
         path_ = os.path.join(BASE_DIR, "OS", year, "index.html")
@@ -83,7 +83,7 @@ def build_courseware_url_path(year):
8383 "通过选项下载对应年份课件,回车默认下载2023年课件,输入其他符号则退出\n " +
8484 "\033 [32mA\033 [0m:2021 \033 [32mB\033 [0m:2022 "
8585 "\033 [32mC\033 [0m:2023 \033 [32mD\033 [0m:All\n " )
86- year = KEY_YEAR .get (year , "Invalid" ) # 将按键转化为年份
86+ year = KEY_YEAR .get (year , "Invalid" ) # 将输入转化为年份
8787
8888 if year == "ALL" :
8989 for item in ['2021' , '2022' , '2023' ]:
@@ -109,6 +109,10 @@ def file_download():
             file_analyse(_path)
         else:
             WITHOUT_DOWNLOAD.append(_url)
+    if fail_download:
+        print("\033[91mThe following files could not be downloaded:\033[0m")
+        for code, url, path in fail_download:  # report each failed download and where the file should be stored locally
+            print(f"\033[91mStatus code: \033[0m{code} \033[0m{url}\033[91m should be stored at: \033[0m{path}")


 # Extract the links that appear in each file
@@ -118,20 +122,19 @@ def file_analyse(filepath):
     # non-HTML files are not analysed
     if filepath.endswith(".html"):
         with open(filepath, 'r', encoding='utf-8') as file:
-            content = file.read()
-            soup = BeautifulSoup(content, 'html.parser')
+            soup = BeautifulSoup(file.read(), 'html.parser')
     else:
         return

     # extract the relative links from the file
-    _links_tags = soup.find_all(href=True) + soup.find_all(src=True) + soup.find_all(data=True)
-    _links_attr = []
-    for link in _links_tags:
-        _links_attr.extend([link.get("href"), link.get("src"), link.get("data")])
-    _links_attr = list(set(_links_attr))  # remove duplicates
+    links_tags = soup.find_all(href=True) + soup.find_all(src=True) + soup.find_all(data=True)
+    links_attr = []
+    for link in links_tags:
+        links_attr.extend([link.get("href"), link.get("src"), link.get("data")])
+    links_attr = list(set(links_attr))  # remove duplicates

     # with the file at filepath as the reference, build each link's full URL and its local storage path
-    for link in _links_attr:
+    for link in links_attr:
         if link is None or link.startswith(("http", "data")):  # "data" URIs are resources embedded in ipynb.html files
             continue
         link = urlparse(link).path  # strip the anchor (fragment)
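+        # e.g. "slides/1.html#end" becomes "slides/1.html" (illustrative example)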
@@ -151,8 +154,7 @@ def file_fix():
     for item in filename:
         filepath = os.path.join(BASE_DIR, "OS", "2023", "build", item)
         with open(filepath, 'r', encoding='utf-8') as file:
-            content = file.read()
-            change = re.sub(r'/OS/2023/slides/', '../slides/', content)
+            change = re.sub(r'/OS/2023/slides/', '../slides/', file.read())
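+            # rewrite absolute slide links as relative ones so the downloaded pages resolve locally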

         with open(filepath, 'w', encoding='utf-8') as file:
             file.write(change)
@@ -163,8 +165,7 @@ def file_decode():
         for item in files:
             if item.endswith(".html"):
                 with open(os.path.join(root, item), 'r', encoding='utf-8') as file:
-                    content = file.read()
-                    change = html.unescape(content)
+                    change = html.unescape(file.read())
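+                    # decode HTML entities left in the saved pages (e.g. &amp; -> &)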

                 with open(os.path.join(root, item), 'w', encoding='utf-8') as file:
                     file.write(change)