Skip to content

Commit 3b11ff3

Browse files
committed
完善课件中各种类型的文件下载,并且精简了代码的实现
1 parent e9f8f31 commit 3b11ff3

File tree

2 files changed

+44
-36
lines changed

2 files changed

+44
-36
lines changed

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,18 @@ python脚本下载[jyy老师](https://jyywiki.cn/)的OS课件
1313
所有的课件内容都在`Courseware`这个文件夹里
1414

1515
# 更新
16+
## V2.3 2023-09-14
17+
这是一次比较重要的更新:本次更新解决了课件中文件下载不全的问题。之前下载不全,是因为需要预先猜测课件里会出现哪些类型的文件;现在不用猜测了,只需要跟着页面中的链接逐个下载即可。
18+
19+
问题也随之而来:老师的`html`文件中出现了一些莫名其妙的路径引用,比如:
20+
```html
21+
<p class="font-serif my-1"><object alt="" class="center" src="/pages/OS/2021/labs/../../img/flame-graph.svg" width="500px" data="../../../pages/OS/img/flame-graph.svg"></object></p>
22+
```
23+
首先这个办法是没有问题的,确实识别到了众多文件类型,`cc`,`go`,`svg`,`pdf`等等。
24+
25+
但是上面这种路径引用直接导致代码抛出了`IndexError`。排查后发现,原因就是`src`中的`/pages/OS/2021/labs/../../img/flame-graph.svg`,它让路径识别出现了问题。可这个文件我又要下载,怎么办呢?难不成要进行特判?这我不能接受。随后我注意到了标签后面的`data`属性,于是改为通过`data`来下载这个文件。~~速度我管不着,代码简洁才是真。~~
26+
27+
1628
## V2.2 2023-09-13
1729
这一版代码比V2.1要节省一个变量,其他的没有变化。优化到这里好像差不多到头了。可能还要着手解决下载速度的问题。
1830

main.py

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,53 +6,48 @@
66

77

88
def download(url_, path_):
    """Download ``url_`` into the local file ``path_`` unless it already exists.

    The parent directory of ``path_`` is created when missing. A response
    whose ``Content-Type`` header starts with ``text`` is written as UTF-8
    text; anything else (including a response without that header) is
    written as raw bytes. A non-200 response is reported on stdout and no
    file is written.

    Args:
        url_: Absolute URL of the resource to fetch.
        path_: Local destination path; may be a bare filename.

    Returns:
        None.
    """
    # Nothing to do for files that were fetched by an earlier run.
    if os.path.exists(path_):
        return

    # A bare filename has an empty dirname, and os.makedirs("") raises,
    # so only create the folder when there is one. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    folder = os.path.dirname(path_)
    if folder:
        os.makedirs(folder, exist_ok=True)

    response = requests.get(url_)
    if response.status_code != 200:
        print(f"\033[91m无法下载文件链接:\033[0m{url_}, \033[91m状态码:\033[0m{response.status_code}")
        return

    # Servers may omit Content-Type; treat that case as binary instead of
    # failing with KeyError.
    content_type = response.headers.get('Content-Type', '')
    if content_type.startswith('text'):
        with open(path_, 'w', encoding="utf-8") as file:
            file.write(response.text)
    else:
        with open(path_, 'wb') as file:
            file.write(response.content)
    print(f"\033[32m已下载 \033[0m{path_}")
2524

2625

2726
class OSCourseware:
2827
BASE_URL = "https://jyywiki.cn"
29-
SOURCE_FILE_TYPE = (
30-
".png", ".jpg", ".gif", ".webp", "jpeg",
31-
".js", ".css", ".html",
32-
".c", ".h", ".cpp", ".py", ".sh", ".S", ".lua", ".txt"
33-
)
28+
BASE_DIR = ""
3429
COURSEWARE_DIR = "Courseware"
3530
WITHOUT_DOWNLOAD = [
3631
"https://jyywiki.cn/pages/OS/2022/Labs/lab-makefile.png", # 404
37-
"https://jyywiki.cn/index.html" # unnecessary
32+
"https://jyywiki.cn/OS/2021/slides/Slides_Author", # 404
33+
"https://jyywiki.cn/index.html" # unnecessary
3834
]
3935
KEY_YEAR = {'A': "2021", 'B': "2022", 'C': "2023", 'D': "ALL", '': "2023"}
4036
year_input = ''
41-
current_dir = ''
42-
sources_url_path_pairs = {} # 不用去重
37+
sources_url_path_pairs = {}
4338

4439
def __init__(self):
45-
self.current_dir = os.path.join(os.getcwd(), self.COURSEWARE_DIR)
4640
self.file_download_option()
4741
self.file_download()
48-
print("下载完成")
42+
print("\033[32m下载完成")
4943

5044
def file_download_option(self):
5145
def build_courseware_url_path(year_):
5246
url_ = f'{self.BASE_URL}/OS/{year_}'
53-
path_ = f'{self.current_dir}\\OS\\{year_}\\index.html'
47+
path_ = f'{os.path.join(self.BASE_DIR, self.COURSEWARE_DIR)}\\OS\\{year_}\\index.html'
5448
return {url_: path_}
5549

50+
self.BASE_DIR = os.getcwd()
5651
self.year_input = input("无法下载的文件会提示。下载成功后提示“下载成功”\n" +
5752
"通过选项下载对应年份课件,回车默认下载2023年课件,输入其他符号则退出\n" +
5853
"\033[32mA\033[0m:2021 \033[32mB\033[0m:2022 "
@@ -67,7 +62,7 @@ def build_courseware_url_path(year_):
6762
if self.year_input != "2023":
6863
self.WITHOUT_DOWNLOAD.append(f'{self.BASE_URL}/OS/2023/index.html')
6964
else:
70-
print("输入非法,程序退出")
65+
print("\033[91m输入非法,程序退出")
7166
sys.exit()
7267

7368
def file_download(self):
@@ -93,21 +88,22 @@ def file_analyse(self, filepath):
9388
_links_tags = soup.find_all(href=True) + soup.find_all(src=True)
9489
_links_attr = []
9590
for link in _links_tags:
96-
_links_attr.extend([link.get("href"), link.get("src")])
91+
_links_attr.extend([link.get("href"), link.get("src"), link.get("data")])
9792
_links_attr = list(set(_links_attr)) # 去除重复的元素
9893

9994
# 补全完整的文件地址和链接
10095
for link in _links_attr:
101-
if link is None or link.startswith(("http", "data")): # data是ipynb.html文件资源
102-
continue
103-
if link.endswith(self.SOURCE_FILE_TYPE):
96+
if link is not None and not link.startswith(("http", "data")): # data是ipynb.html文件资源
10497
# 以filepath指定的文件为参照补全文件中的网址以及在本地存储的地址
10598
path = os.path.normpath(os.path.join(os.path.dirname(filepath), link.replace("/", "\\")))
106-
relative_path = path.split(os.getcwd() + os.sep + self.COURSEWARE_DIR)[1]
107-
url = urljoin(self.BASE_URL, relative_path.replace("\\", "/"))
108-
if url not in self.WITHOUT_DOWNLOAD:
109-
self.sources_url_path_pairs.update({url: path})
110-
self.WITHOUT_DOWNLOAD.append(url)
99+
try:
100+
relative_path = path.split(self.BASE_DIR + os.sep + self.COURSEWARE_DIR)[1]
101+
url = urljoin(self.BASE_URL, relative_path.replace("\\", "/"))
102+
if url not in self.WITHOUT_DOWNLOAD:
103+
self.sources_url_path_pairs.update({url: path})
104+
self.WITHOUT_DOWNLOAD.append(url)
105+
except IndexError:
106+
continue
111107

112108

113109
courseware = OSCourseware()

0 commit comments

Comments
 (0)