Skip to content

Commit c309cf7

Browse files
authored
feat 解析微信文章目录 (TeamWiseFlow#55)
* feat: 解析微信文章目录 (parse WeChat article album/table-of-contents pages)
* fix: mp_crawler should return https urls
1 parent bdc3cbf commit c309cf7

File tree

1 file changed

+7
-1
lines changed

1 file changed

+7
-1
lines changed

core/scrapers/mp_crawler.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3+
from typing import Union
34
import httpx
45
from bs4 import BeautifulSoup
56
from datetime import datetime
@@ -11,7 +12,7 @@
1112
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}
1213

1314

14-
async def mp_crawler(url: str, logger) -> (int, dict):
15+
async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
1516
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
1617
logger.warning(f'{url} is not a mp url, you should not use this function')
1718
return -5, {}
@@ -34,6 +35,11 @@ async def mp_crawler(url: str, logger) -> (int, dict):
3435

3536
soup = BeautifulSoup(response.text, 'html.parser')
3637

38+
if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
39+
# 文章目录
40+
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
41+
return 1, set(urls)
42+
3743
# Get the original release date first
3844
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
3945
match = re.search(pattern, response.text)

0 commit comments

Comments
 (0)