Commit 023d771

Fix for the anchor problem in Jupyter files
1 parent 5197706 commit 023d771

2 files changed (+17, -8 lines)

2 files changed

+17
-8
lines changed

README.md

Lines changed: 7 additions & 0 deletions
````diff
@@ -10,6 +10,13 @@ A Python script that downloads the OS courseware of [jyy老师](https://jyywiki.cn/). Try to use the most…
 All of the courseware content is in the `Courseware` folder.
 
 # Updates
+## V2.4 2023-09-14
+I had forgotten to verify the downloads for the other years' courseware, and then found that the links in the `href` attributes of the Jupyter courseware look like this:
+```html
+<h3 id="Demo%EF%BC%9A%E4%BD%BF%E7%94%A8-tar-%E5%91%BD%E4%BB%A4">Demo&#65306;&#20351;&#29992; tar &#21629;&#20196;<a class="anchor-link" href="lect1.ipynb.html#Demo%EF%BC%9A%E4%BD%BF%E7%94%A8-tar-%E5%91%BD%E4%BB%A4">&#182;</a>
+```
+After parsing, this made strange folders show up in the downloads. Adding the single line `link = urlparse(link).path` solves the problem: it extracts just `lect1.ipynb.html`.
+
 ## V2.3 2023-09-14
 A fairly important update: this release fixes incomplete downloads of the courseware files. Downloads used to be incomplete because the script had to guess what types of files might exist; now no guessing is needed, it simply follows the links and downloads them.
 
````
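As a quick sanity check on the V2.4 fix, here is a minimal sketch (Python standard library only; the URL is the one from the HTML snippet above) of what `urlparse(link).path` returns:

```python
from urllib.parse import urlparse

# The href taken from the Jupyter-exported HTML: the target file,
# followed by a percent-encoded anchor after the '#'.
link = "lect1.ipynb.html#Demo%EF%BC%9A%E4%BD%BF%E7%94%A8-tar-%E5%91%BD%E4%BB%A4"

# urlparse() splits the URL into components; .path keeps only the part
# before the fragment, so the anchor can no longer leak into a file path.
print(urlparse(link).path)  # -> lect1.ipynb.html
```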
main.py

Lines changed: 10 additions & 8 deletions
````diff
@@ -3,6 +3,7 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
+from urllib.parse import urlparse
 
 
 def download(url_, path_):
@@ -18,7 +19,7 @@ def download(url_, path_):
         else:
             with open(path_, 'wb') as file:
                 file.write(response.content)
-            print(f"\033[32mDownloaded \033[0m{path_}")
+            print(f"\033[32mDownloaded \033[0mfile URL {url_}, file path {path_}")
     else:
         print(f"\033[91mCould not download file URL:\033[0m{url_}, \033[91mstatus code:\033[0m{response.status_code}")
 
@@ -30,6 +31,7 @@ class OSCourseware:
     WITHOUT_DOWNLOAD = [
         "https://jyywiki.cn/pages/OS/2022/Labs/lab-makefile.png",  # 404
         "https://jyywiki.cn/OS/2021/slides/Slides_Author",  # 404
+        "https://jyywiki.cn/OS/2022/slides/Slides_Author",  # 404
         "https://jyywiki.cn/index.html"  # unnecessary
     ]
     KEY_YEAR = {'A': "2021", 'B': "2022", 'C': "2023", 'D': "ALL", '': "2023"}
@@ -60,7 +62,7 @@ def build_courseware_url_path(year_):
         elif self.year_input != "Invalid":
             self.sources_url_path_pairs.update(build_courseware_url_path(self.year_input))
             if self.year_input != "2023":
-                self.WITHOUT_DOWNLOAD.append(f'{self.BASE_URL}/OS/2023/index.html')
+                self.WITHOUT_DOWNLOAD.append(f'{self.BASE_URL}/OS/2023/index.html')  # avoid mistakenly downloading 2023/index.html linked from other files
         else:
             print("\033[91mInvalid input, exiting the program")
             sys.exit()
@@ -85,22 +87,22 @@ def file_analyse(self, filepath):
             return
 
         # Extract the relative links from the file
-        _links_tags = soup.find_all(href=True) + soup.find_all(src=True)
+        _links_tags = soup.find_all(href=True) + soup.find_all(src=True) + soup.find_all(data=True)
         _links_attr = []
        for link in _links_tags:
             _links_attr.extend([link.get("href"), link.get("src"), link.get("data")])
         _links_attr = list(set(_links_attr))  # remove duplicate elements
 
-        # Complete the full file paths and URLs
+        # Resolve each URL in the file, and its local storage path, relative to the file given by filepath
        for link in _links_attr:
             if link is not None and not link.startswith(("http", "data")):  # data: URIs are resources in ipynb.html files
-                # Resolve each URL in the file, and its local storage path, relative to the file given by filepath
-                path = os.path.normpath(os.path.join(os.path.dirname(filepath), link.replace("/", "\\")))
+                link = urlparse(link).path  # strip the anchor
+                absolute_path = os.path.normpath(os.path.join(os.path.dirname(filepath), link.replace("/", "\\")))
                 try:
-                    relative_path = path.split(self.BASE_DIR + os.sep + self.COURSEWARE_DIR)[1]
+                    relative_path = absolute_path.split(self.BASE_DIR + os.sep + self.COURSEWARE_DIR)[1]
                     url = urljoin(self.BASE_URL, relative_path.replace("\\", "/"))
                     if url not in self.WITHOUT_DOWNLOAD:
-                        self.sources_url_path_pairs.update({url: path})
+                        self.sources_url_path_pairs.update({url: absolute_path})
                     self.WITHOUT_DOWNLOAD.append(url)
                 except IndexError:
                     continue
````
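To see the patched `file_analyse` logic in isolation, here is a self-contained sketch of the extract-and-strip flow (the sample HTML and file names are hypothetical; `BeautifulSoup` comes from `bs4` exactly as in the diff):

```python
from urllib.parse import urlparse

from bs4 import BeautifulSoup

# Hypothetical stand-in for a downloaded ipynb.html page.
html = """
<img src="img/fork.png">
<a class="anchor-link" href="lect1.ipynb.html#Demo%EF%BC%9A%E4%BD%BF%E7%94%A8-tar-%E5%91%BD%E4%BB%A4">&#182;</a>
<object data="figures/pipe.svg"></object>
"""
soup = BeautifulSoup(html, "html.parser")

# Collect every href/src/data attribute, as the patched code does.
tags = soup.find_all(href=True) + soup.find_all(src=True) + soup.find_all(data=True)
links = {attr for tag in tags
         for attr in (tag.get("href"), tag.get("src"), tag.get("data"))
         if attr is not None}  # mirrors the list(set(...)) de-duplication

for link in sorted(links):
    if not link.startswith(("http", "data")):
        print(urlparse(link).path)  # the anchor is gone, only the path remains

# Output:
#   figures/pipe.svg
#   img/fork.png
#   lect1.ipynb.html
```

Note that `find_all(data=True)` matches any tag carrying a `data` attribute, such as `<object>`, which is why the patch also collects `data` attributes alongside `href` and `src`.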
