Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion core/scrapers/mp_crawler.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

from typing import Union
import httpx
from bs4 import BeautifulSoup
from datetime import datetime
Expand All @@ -11,7 +12,7 @@
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/604.1 Edg/112.0.100.0'}


async def mp_crawler(url: str, logger) -> (int, dict):
async def mp_crawler(url: str, logger) -> tuple[int, Union[set, dict]]:
if not url.startswith('https://mp.weixin.qq.com') and not url.startswith('http://mp.weixin.qq.com'):
logger.warning(f'{url} is not a mp url, you should not use this function')
return -5, {}
Expand All @@ -34,6 +35,11 @@ async def mp_crawler(url: str, logger) -> (int, dict):

soup = BeautifulSoup(response.text, 'html.parser')

if url.startswith('https://mp.weixin.qq.com/mp/appmsgalbum'):
# 文章目录
urls = {li.attrs['data-link'].replace("http://", "https://", 1) for li in soup.find_all('li', class_='album__list-item')}
return 1, set(urls)

# Get the original release date first
pattern = r"var createTime = '(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}'"
match = re.search(pattern, response.text)
Expand Down