
Commit b683073

add start-up script
1 parent 31411cd commit b683073

File tree

10 files changed, +70 / -177 lines

.dockerignore

Lines changed: 2 additions & 2 deletions

@@ -10,5 +10,5 @@ web/node_modules
 docker-compose.yaml
 Dockerfile
 README.md
-backend/__pycache__
-backend/WStest
+core/__pycache__
+core/work_dir

.gitignore

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ __pycache__
 .env
 .venv/
 core/pb/pb_data/
-core/WStest/
+core/work_dir/

core/docker_entrypoint.sh

Lines changed: 3 additions & 3 deletions

@@ -2,6 +2,6 @@
 set -o allexport
 source ../.env
 set +o allexport
-uvicorn backend:app --reload --host localhost --port 8077
-#exec uvicorn backend:app --reload --host localhost --port 8077 &
-#exec python background_task.py
+exec pb/pocketbase serve &
+exec python tasks.py &
+exec uvicorn backend:app --reload --host localhost --port 8077
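
A note on the new start-up flow: PocketBase and the background task runner (tasks.py) are launched as background jobs, while uvicorn runs in the foreground and keeps the container alive. For local debugging outside Docker, a rough Python equivalent is sketched below; it is a hypothetical helper, not part of this commit, and assumes it is run from the core/ directory with the .env variables already exported.

    import subprocess

    def start_services():
        # Side processes: PocketBase and the scheduled crawler loop.
        procs = [
            subprocess.Popen(["pb/pocketbase", "serve"]),
            subprocess.Popen(["python", "tasks.py"]),
        ]
        try:
            # Foreground process: the ASGI backend, mirroring the entrypoint.
            subprocess.run(
                ["uvicorn", "backend:app", "--reload", "--host", "localhost", "--port", "8077"],
                check=True,
            )
        finally:
            for p in procs:
                p.terminate()

    if __name__ == "__main__":
        start_services()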

core/insights/__init__.py

Lines changed: 4 additions & 13 deletions

@@ -23,7 +23,6 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
     articles = []
     for url in urls:
         logger.debug(f"fetching {url}")
-
         if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(url, logger)
         else:
@@ -37,7 +36,7 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue
-
+        existing_urls.append(url)
        expiration_date = expiration.strftime('%Y-%m-%d')
        article_date = int(result['publish_time'])
        if article_date < int(expiration_date.replace('-', '')):
@@ -48,7 +47,6 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
            for k, v in cache[url].items():
                if v:
                    result[k] = v
-
        articles.append(result)

    return articles
@@ -57,7 +55,7 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
 async def pipeline(_input: dict):
     cache = {}
     source = _input['user_id'].split('@')[-1]
-    logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
+    logger.debug(f"received new task, user: {source}, Addition info: {_input['addition']}")

     global existing_urls
     expiration_date = datetime.now() - timedelta(days=expiration_days)
@@ -100,7 +98,7 @@ async def pipeline(_input: dict):
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        if domain in scraper_map:
-            result = scraper_map[domain](url, logger)
+            result = scraper_map[domain](url, expiration_date.date(), existing_urls, logger)
        else:
            result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
        articles.extend(result)
@@ -120,12 +118,7 @@ async def pipeline(_input: dict):
        return

    for article in articles:
-        if article['url'] in existing_urls:
-            # For the case of entering multiple sites at the same time,
-            # there is indeed a situation where duplicate articles are mixed into the same batch
-            logger.debug(f"{article['url']} duplicated, skip")
-            continue
-
+        logger.debug(f"article: {article['title']}")
        insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
        try:
            article_id = pb.add(collection_name='articles', body=article)
@@ -135,8 +128,6 @@ async def pipeline(_input: dict):
            json.dump(article, f, ensure_ascii=False, indent=4)
            continue

-        existing_urls.append(article['url'])
-
        if not insights:
            continue
        article_tags = set()
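
The deduplication moves upstream with this change: a URL is appended to existing_urls inside get_articles as soon as its crawl succeeds (flag 11), so pipeline no longer needs the per-batch duplicate check or the later existing_urls.append(article['url']). A minimal sketch of the resulting pattern, with crawl() standing in for the real mp_crawler/simple_crawler/llm_crawler calls; the stub, its return values, and the early membership check are illustrative rather than taken from this diff.

    import asyncio

    existing_urls: list[str] = []

    async def crawl(url: str) -> tuple[int, dict]:
        # Stand-in for mp_crawler / simple_crawler / llm_crawler; 11 means success here.
        return 11, {"url": url, "title": f"stub for {url}"}

    async def get_articles(urls: list[str]) -> list[dict]:
        articles = []
        for url in urls:
            if url in existing_urls:       # already seen, even within the same batch
                continue
            flag, result = await crawl(url)
            if flag != 11:
                continue
            existing_urls.append(url)      # recorded at fetch time, not after insight extraction
            articles.append(result)
        return articles

    print(asyncio.run(get_articles(["https://example.com/a", "https://example.com/a"])))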

core/scrapers/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-> **This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
+**This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
 >
 > # Custom Crawler Configuration
 >
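
Tied to the pipeline change above, a registered source-specific crawler is now invoked as scraper_map[domain](url, expiration_date.date(), existing_urls, logger) and its result is extended into the article list. A hedged stub of what such a crawler could look like under that signature; the function name, dictionary fields, and registration line are illustrative and not part of the repository.

    from datetime import date

    def example_list_scraper(url: str, expiration: date, existing: list[str], logger) -> list[dict]:
        """Parse one source's article-list URL and return article detail dicts."""
        logger.info(f"scraping list page {url}")
        articles = []
        # ... fetch the list page and build one dict per article here ...
        candidate = {
            "url": f"{url.rstrip('/')}/example-article",   # illustrative value
            "title": "Example article",
            "abstract": "One-line summary",
            "content": "Body text",
            "publish_time": "20240401",
        }
        if candidate["url"] not in existing and candidate["publish_time"] >= expiration.strftime("%Y%m%d"):
            articles.append(candidate)
            existing.append(candidate["url"])
        return articles

    # scraper_map = {"news.example.com": example_list_scraper}   # hypothetical registration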

core/scrapers/README_de.md

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
-
-> **In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
+**In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
 >
 > # Konfiguration des benutzerdefinierten Crawlers
 >

core/scrapers/README_fr.md

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
-
-> **Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
+**Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
 >
 > # Configuration du Crawler Personnalisé
 >

core/scrapers/general_scraper.py

Lines changed: 21 additions & 8 deletions

@@ -2,7 +2,6 @@

 import os
 from urllib.parse import urlparse
-import re
 from .simple_crawler import simple_crawler
 from .mp_crawler import mp_crawler
 import httpx
@@ -42,9 +41,19 @@ def text_from_soup(soup: BeautifulSoup) -> str:

 def parse_html_content(out: str) -> dict:
     dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
-    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
-    result = pattern.findall(out)
-    result = result[0].strip()
+    if '"""' in out:
+        semaget = out.split('"""')
+        if len(semaget) > 1:
+            result = semaget[1].strip()
+        else:
+            result = semaget[0].strip()
+    else:
+        result = out.strip()
+
+    while result.endswith('"'):
+        result = result[:-1]
+    result = result.strip()
+
     dict_strs = result.split('||')
     if not dict_strs:
         dict_strs = result.split('|||')
@@ -67,7 +76,7 @@ def parse_html_content(out: str) -> dict:
     return dct


-sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
+sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format:
 """
 Title||Summary||Content||Release Date YYYY-MM-DD
 """
@@ -98,7 +107,7 @@ async def llm_crawler(url: str, logger) -> (int, dict):
     html_lines = [line.strip() for line in html_lines if line.strip()]
     html_text = "\n".join(html_lines)
     if len(html_text) > 29999:
-        logger.warning(f"{url} content too long for llm parsing")
+        logger.info(f"{url} content too long for llm parsing")
         return 0, {}

     if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
@@ -177,11 +186,15 @@ async def general_scraper(site: str, expiration: date, existing: list[str], logg
     # Parse all URLs
     parsed_url = urlparse(site)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
-    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
+    urls = []
+    for link in soup.find_all("a", href=True):
+        absolute_url = urljoin(base_url, link["href"])
+        if urlparse(absolute_url).netloc == parsed_url.netloc:
+            urls.append(absolute_url)

     if not urls:
         # maybe it's an article site
-        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
+        logger.info(f"can not find any link from {site}, maybe it's an article site...")
         if site in existing:
             logger.debug(f"{site} has been crawled before, skip it")
             return []
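
Two behavioural changes are worth calling out: parse_html_content now tolerates model output with or without the triple-quote fencing (the old regex path raised IndexError when no fenced block was found), and general_scraper keeps only links on the same domain as the site being crawled. Below is a standalone sketch of the new payload extraction with a small usage check; the helper name is illustrative.

    def extract_payload(out: str) -> str:
        # Mirror of the new tolerant parsing in parse_html_content: prefer the text
        # after the first pair of triple quotes, fall back to the raw output, then
        # trim any stray trailing quote characters.
        if '"""' in out:
            parts = out.split('"""')
            result = parts[1].strip() if len(parts) > 1 else parts[0].strip()
        else:
            result = out.strip()
        while result.endswith('"'):
            result = result[:-1]
        return result.strip()

    # Fenced and unfenced model output reduce to the same '||'-separated payload.
    assert extract_payload('"""Title||Summary||Content||2024-04-01"""') == 'Title||Summary||Content||2024-04-01'
    assert extract_payload('Title||Summary||Content||2024-04-01') == 'Title||Summary||Content||2024-04-01'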

core/scrapers/simple_crawler.py

Lines changed: 6 additions & 1 deletion

@@ -36,7 +36,12 @@ async def simple_crawler(url: str, logger) -> (int, dict):
     rawdata = response.content
     encoding = chardet.detect(rawdata)['encoding']
     text = rawdata.decode(encoding, errors='replace')
-    result = extractor.extract(text)
+    try:
+        result = extractor.extract(text)
+    except Exception as e:
+        logger.info(f"gne extracct error: {e}")
+        return 0, {}
+
     if not result:
         logger.error(f"gne cannot extract {url}")
         return 0, {}
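
The extract call is now wrapped so that a page that makes gne raise is treated the same as a page it cannot parse, instead of aborting the whole crawl batch. A minimal sketch of the guarded pattern follows; it assumes the gne GeneralNewsExtractor is how extractor is constructed elsewhere in this file, and the helper name and log wording are illustrative.

    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()

    def safe_extract(html: str, url: str, logger) -> dict:
        # gne can raise on malformed markup; map that onto the same "empty result"
        # path used when extraction simply finds nothing.
        try:
            result = extractor.extract(html)
        except Exception as e:
            logger.info(f"gne extract error for {url}: {e}")
            return {}
        if not result:
            logger.error(f"gne cannot extract {url}")
            return {}
        return result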

core/tasks.py

Lines changed: 30 additions & 144 deletions

@@ -1,152 +1,38 @@
-"""
-通过编辑这个脚本,可以自定义需要的后台任务
-"""
-import schedule
-import time
-from topnews import pipeline
-from loguru import logger
-from utils.pb_api import PbTalker
-import os
-from utils.general_utils import get_logger_level
-from datetime import datetime, timedelta
-import pytz
-import requests
+import asyncio
+from insights import pipeline, pb, logger

+counter = 0

-project_dir = os.environ.get("PROJECT_DIR", "")
-if project_dir:
-    os.makedirs(project_dir, exist_ok=True)
-logger_file = os.path.join(project_dir, 'tasks.log')
-dsw_log = get_logger_level()
-logger.add(
-    logger_file,
-    level=dsw_log,
-    backtrace=True,
-    diagnose=True,
-    rotation="50 MB"
-)

-pb = PbTalker(logger)
-utc_now = datetime.now(pytz.utc)
-# 减去一天得到前一天的UTC时间
-utc_yesterday = utc_now - timedelta(days=1)
-utc_last = utc_yesterday.strftime("%Y-%m-%d %H:%M:%S")
-
-
-def task():
-    """
-    global counter
-    sites = pb.read('sites', filter='activated=True')
-    urls = []
-    for site in sites:
-        if not site['per_hours'] or not site['url']:
-            continue
-        if counter % site['per_hours'] == 0:
-            urls.append(site['url'])
-    logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
-    logger.info(urls)
-    if urls:
-        sp(sites=urls)
-    else:
-        if counter % 24 == 0:
-            sp()
-        else:
-            print('\033[0;33mno work for this loop\033[0m')
-    counter += 1
-    """
-    global utc_last
-    logger.debug(f'last_collect_time: {utc_last}')
-    datas = pb.read(collection_name='insights', filter=f'updated>="{utc_last}"', fields=['id', 'content', 'tag', 'articles'])
-    logger.debug(f"got {len(datas)} items")
-    utc_last = datetime.now(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
-    logger.debug(f'now_utc_time: {utc_last}')
-
-    tags = pb.read(collection_name='tags', filter=f'activated=True')
-    tags_dict = {item["id"]: item["name"] for item in tags if item["name"]}
-    top_news = {}
-    for id, name in tags_dict.items():
-        logger.debug(f'tag: {name}')
-        data = [item for item in datas if item['tag'] == id]
-        topnew = pipeline(data, logger)
-        if not topnew:
-            logger.debug(f'no top news for {name}')
-            continue
-
-        top_news[id] = {}
-        for content, articles in topnew.items():
-            content_urls = [pb.read('articles', filter=f'id="{a}"', fields=['url'])[0]['url'] for a in articles]
-            # 去除重叠内容
-            # 如果发现重叠内容,哪个标签长就把对应的从哪个标签删除
-            to_skip = False
-            for k, v in top_news.items():
-                to_del_key = None
-                for c, u in v.items():
-                    if not set(content_urls).isdisjoint(set(u)):
-                        if len(topnew) > len(v):
-                            to_skip = True
-                        else:
-                            to_del_key = c
-                        break
-                if to_del_key:
-                    del top_news[k][to_del_key]
-                if to_skip:
-                    break
-            if not to_skip:
-                top_news[id][content] = content_urls
-
-        if not top_news[id]:
-            del top_news[id]
-
-    if not top_news:
-        logger.info("no top news today")
+async def process_site(site, counter):
+    if not site['per_hours'] or not site['url']:
         return
+    if counter % site['per_hours'] == 0:
+        logger.info(f"applying {site['url']}")
+        request_input = {
+            "user_id": "schedule_tasks",
+            "type": "site",
+            "content": site['url'],
+            "addition": f"task execute loop {counter + 1}"
+        }
+        await pipeline(request_input)
+
+
+async def schedule_pipeline(interval):
+    global counter
+    while True:
+        sites = pb.read('sites', filter='activated=True')
+        logger.info(f'task execute loop {counter + 1}')
+        await asyncio.gather(*[process_site(site, counter) for site in sites])

-    # 序列化为字符串
-    top_news_text = {"#党建引领基层治理": [],
-                     "#数字社区": [],
-                     "#优秀活动案例": []}
-
-    for id, v in top_news.items():
-        # top_news[id] = {content: '\n\n'.join(urls) for content, urls in v.items()}
-        top_news[id] = {content: urls[0] for content, urls in v.items()}
-        if id == 's3kqj9ek8nvtthr':
-            top_news_text["#数字社区"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-        elif id == 'qpcgotbqyz3a617':
-            top_news_text["#优秀活动案例"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-        else:
-            top_news_text["#党建引领基层治理"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-
-    top_news_text = {k: "\n".join(v) for k, v in top_news_text.items()}
-    top_news_text = "\n\n".join(f"{k}\n{v}" for k, v in top_news_text.items())
-    logger.info(top_news_text)
-
-    data = {
-        "wxid": "R:10860349446619856",
-        "content": top_news_text
-    }
-    try:
-        response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
-        if response.status_code == 200:
-            logger.info("send message to wechat success")
-        time.sleep(1)
-        data = {
-            "wxid": "R:10860349446619856",
-            "content": "[太阳] 今日份的临小助内参来啦!",
-            "atlist": ["@all"]
-        }
-        try:
-            response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
-            if response.status_code == 200:
-                logger.info("send notify to wechat success")
-        except Exception as e:
-            logger.error(f"send notify to wechat failed: {e}")
-    except Exception as e:
-        logger.error(f"send message to wechat failed: {e}")
+        counter += 1
+        logger.info(f'task execute loop finished, work after {interval} seconds')
+        await asyncio.sleep(interval)


-schedule.every().day.at("07:38").do(task)
+async def main():
+    interval_hours = 1
+    interval_seconds = interval_hours * 60 * 60
+    await schedule_pipeline(interval_seconds)

-task()
-while True:
-    schedule.run_pending()
-    time.sleep(60)
+asyncio.run(main())
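
The rewrite replaces the schedule/requests-based task with a plain asyncio loop: every interval seconds it reads the activated sites, fans out process_site coroutines with asyncio.gather, bumps the loop counter, and sleeps. A self-contained sketch of just that scheduling skeleton, with read_sites() and run_site() standing in for pb.read(...) and pipeline(...); both stubs are illustrative.

    import asyncio

    def read_sites() -> list[dict]:
        # Stand-in for pb.read('sites', filter='activated=True').
        return [{"url": "https://example.com", "per_hours": 1}]

    async def run_site(site: dict, counter: int) -> None:
        # Stand-in for process_site(): honour per_hours before doing any work.
        if not site.get("per_hours") or not site.get("url"):
            return
        if counter % site["per_hours"] == 0:
            print(f"applying {site['url']} (loop {counter + 1})")

    async def schedule(interval: int, loops: int = 2) -> None:
        counter = 0
        for _ in range(loops):            # the real task uses `while True:`
            await asyncio.gather(*(run_site(s, counter) for s in read_sites()))
            counter += 1
            await asyncio.sleep(interval)

    asyncio.run(schedule(interval=1))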
