
Commit b683073

add start-up script
1 parent 31411cd commit b683073

File tree

10 files changed, +70 / -177 lines

.dockerignore

Lines changed: 2 additions & 2 deletions

@@ -10,5 +10,5 @@ web/node_modules
 docker-compose.yaml
 Dockerfile
 README.md
-backend/__pycache__
-backend/WStest
+core/__pycache__
+core/work_dir

.gitignore

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ __pycache__
 .env
 .venv/
 core/pb/pb_data/
-core/WStest/
+core/work_dir/

core/docker_entrypoint.sh

Lines changed: 3 additions & 3 deletions

@@ -2,6 +2,6 @@
 set -o allexport
 source ../.env
 set +o allexport
-uvicorn backend:app --reload --host localhost --port 8077
-#exec uvicorn backend:app --reload --host localhost --port 8077 &
-#exec python background_task.py
+exec pb/pocketbase serve &
+exec python tasks.py &
+exec uvicorn backend:app --reload --host localhost --port 8077
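
A note on the new start-up flow: PocketBase and the background task runner (tasks.py) are launched as background jobs, while uvicorn runs in the foreground and keeps the container alive. For local debugging outside Docker, a rough Python equivalent is sketched below; it is a hypothetical helper, not part of this commit, and assumes it is run from the core/ directory with the .env variables already exported.

    import subprocess

    def start_services():
        # Side processes: PocketBase and the scheduled crawler loop.
        procs = [
            subprocess.Popen(["pb/pocketbase", "serve"]),
            subprocess.Popen(["python", "tasks.py"]),
        ]
        try:
            # Foreground process: the ASGI backend, mirroring the entrypoint.
            subprocess.run(
                ["uvicorn", "backend:app", "--reload", "--host", "localhost", "--port", "8077"],
                check=True,
            )
        finally:
            for p in procs:
                p.terminate()

    if __name__ == "__main__":
        start_services()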

core/insights/__init__.py

Lines changed: 4 additions & 13 deletions

@@ -23,7 +23,6 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
     articles = []
     for url in urls:
         logger.debug(f"fetching {url}")
-
         if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
            flag, result = await mp_crawler(url, logger)
         else:
@@ -37,7 +36,7 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
            flag, result = await llm_crawler(url, logger)
            if flag != 11:
                continue
-
+        existing_urls.append(url)
        expiration_date = expiration.strftime('%Y-%m-%d')
        article_date = int(result['publish_time'])
        if article_date < int(expiration_date.replace('-', '')):
@@ -48,7 +47,6 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
            for k, v in cache[url].items():
                if v:
                    result[k] = v
-
        articles.append(result)

    return articles
@@ -57,7 +55,7 @@ async def get_articles(urls: list[str], expiration: datetime, cache: dict = {})
 async def pipeline(_input: dict):
     cache = {}
     source = _input['user_id'].split('@')[-1]
-    logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")
+    logger.debug(f"received new task, user: {source}, Addition info: {_input['addition']}")

     global existing_urls
     expiration_date = datetime.now() - timedelta(days=expiration_days)
@@ -100,7 +98,7 @@ async def pipeline(_input: dict):
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        if domain in scraper_map:
-            result = scraper_map[domain](url, logger)
+            result = scraper_map[domain](url, expiration_date.date(), existing_urls, logger)
        else:
            result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
        articles.extend(result)
@@ -120,12 +118,7 @@ async def pipeline(_input: dict):
        return

    for article in articles:
-        if article['url'] in existing_urls:
-            # For the case of entering multiple sites at the same time,
-            # there is indeed a situation where duplicate articles are mixed into the same batch
-            logger.debug(f"{article['url']} duplicated, skip")
-            continue
-
+        logger.debug(f"article: {article['title']}")
        insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
        try:
            article_id = pb.add(collection_name='articles', body=article)
@@ -135,8 +128,6 @@ async def pipeline(_input: dict):
            json.dump(article, f, ensure_ascii=False, indent=4)
            continue

-        existing_urls.append(article['url'])
-
        if not insights:
            continue
        article_tags = set()
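
The deduplication moves upstream with this change: a URL is appended to existing_urls inside get_articles as soon as its crawl succeeds (flag 11), so pipeline no longer needs the per-batch duplicate check or the later existing_urls.append(article['url']). A minimal sketch of the resulting pattern, with crawl() standing in for the real mp_crawler/simple_crawler/llm_crawler calls; the stub, its return values, and the early membership check are illustrative rather than taken from this diff.

    import asyncio

    existing_urls: list[str] = []

    async def crawl(url: str) -> tuple[int, dict]:
        # Stand-in for mp_crawler / simple_crawler / llm_crawler; 11 means success here.
        return 11, {"url": url, "title": f"stub for {url}"}

    async def get_articles(urls: list[str]) -> list[dict]:
        articles = []
        for url in urls:
            if url in existing_urls:       # already seen, even within the same batch
                continue
            flag, result = await crawl(url)
            if flag != 11:
                continue
            existing_urls.append(url)      # recorded at fetch time, not after insight extraction
            articles.append(result)
        return articles

    print(asyncio.run(get_articles(["https://example.com/a", "https://example.com/a"])))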

core/scrapers/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-> **This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
+**This folder is intended for placing crawlers specific to particular sources. Note that the crawlers here should be able to parse the article list URL of the source and return a dictionary of article details.**
 >
 > # Custom Crawler Configuration
 >
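
Tied to the pipeline change above, a registered source-specific crawler is now invoked as scraper_map[domain](url, expiration_date.date(), existing_urls, logger) and its result is extended into the article list. A hedged stub of what such a crawler could look like under that signature; the function name, dictionary fields, and registration line are illustrative and not part of the repository.

    from datetime import date

    def example_list_scraper(url: str, expiration: date, existing: list[str], logger) -> list[dict]:
        """Parse one source's article-list URL and return article detail dicts."""
        logger.info(f"scraping list page {url}")
        articles = []
        # ... fetch the list page and build one dict per article here ...
        candidate = {
            "url": f"{url.rstrip('/')}/example-article",   # illustrative value
            "title": "Example article",
            "abstract": "One-line summary",
            "content": "Body text",
            "publish_time": "20240401",
        }
        if candidate["url"] not in existing and candidate["publish_time"] >= expiration.strftime("%Y%m%d"):
            articles.append(candidate)
            existing.append(candidate["url"])
        return articles

    # scraper_map = {"news.example.com": example_list_scraper}   # hypothetical registration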

core/scrapers/README_de.md

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
-
-> **In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
+**In diesem Ordner können Crawlers für spezifische Quellen abgelegt werden. Beachten Sie, dass die Crawlers hier in der Lage sein sollten, die URL der Artikelliste der Quelle zu analysieren und ein Wörterbuch mit Artikeldetails zurückzugeben.**
 >
 > # Konfiguration des benutzerdefinierten Crawlers
 >

core/scrapers/README_fr.md

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
-
-> **Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
+**Ce dossier est destiné à accueillir des crawlers spécifiques à des sources particulières. Notez que les crawlers ici doivent être capables de parser l'URL de la liste des articles de la source et de retourner un dictionnaire de détails des articles.**
 >
 > # Configuration du Crawler Personnalisé
 >

core/scrapers/general_scraper.py

Lines changed: 21 additions & 8 deletions

@@ -2,7 +2,6 @@

 import os
 from urllib.parse import urlparse
-import re
 from .simple_crawler import simple_crawler
 from .mp_crawler import mp_crawler
 import httpx
@@ -42,9 +41,19 @@ def text_from_soup(soup: BeautifulSoup) -> str:

 def parse_html_content(out: str) -> dict:
     dct = {'title': '', 'abstract': '', 'content': '', 'publish_time': ''}
-    pattern = re.compile(r'\"\"\"(.*?)\"\"\"', re.DOTALL)
-    result = pattern.findall(out)
-    result = result[0].strip()
+    if '"""' in out:
+        semaget = out.split('"""')
+        if len(semaget) > 1:
+            result = semaget[1].strip()
+        else:
+            result = semaget[0].strip()
+    else:
+        result = out.strip()
+
+    while result.endswith('"'):
+        result = result[:-1]
+    result = result.strip()
+
     dict_strs = result.split('||')
     if not dict_strs:
         dict_strs = result.split('|||')
@@ -67,7 +76,7 @@ def parse_html_content(out: str) -> dict:
     return dct


-sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format (enclosed within triple quotes):
+sys_info = '''As an HTML parser, you'll receive a block of HTML code. Your task is to extract its title, summary, content, and publication date, with the date formatted as YYYY-MM-DD. Return the results in the following format:
 """
 Title||Summary||Content||Release Date YYYY-MM-DD
 """
@@ -98,7 +107,7 @@ async def llm_crawler(url: str, logger) -> (int, dict):
     html_lines = [line.strip() for line in html_lines if line.strip()]
     html_text = "\n".join(html_lines)
     if len(html_text) > 29999:
-        logger.warning(f"{url} content too long for llm parsing")
+        logger.info(f"{url} content too long for llm parsing")
         return 0, {}

     if not html_text or html_text.startswith('服务器错误') or html_text.startswith(
@@ -177,11 +186,15 @@ async def general_scraper(site: str, expiration: date, existing: list[str], logg
     # Parse all URLs
     parsed_url = urlparse(site)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
-    urls = [urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)]
+    urls = []
+    for link in soup.find_all("a", href=True):
+        absolute_url = urljoin(base_url, link["href"])
+        if urlparse(absolute_url).netloc == parsed_url.netloc:
+            urls.append(absolute_url)

     if not urls:
         # maybe it's an article site
-        logger.warning(f"can not find any link from {site}, maybe it's an article site...")
+        logger.info(f"can not find any link from {site}, maybe it's an article site...")
         if site in existing:
             logger.debug(f"{site} has been crawled before, skip it")
             return []
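
Two behavioural changes are worth calling out: parse_html_content now tolerates model output with or without the triple-quote fencing (the old regex path raised IndexError when no fenced block was found), and general_scraper keeps only links on the same domain as the site being crawled. Below is a standalone sketch of the new payload extraction with a small usage check; the helper name is illustrative.

    def extract_payload(out: str) -> str:
        # Mirror of the new tolerant parsing in parse_html_content: prefer the text
        # after the first pair of triple quotes, fall back to the raw output, then
        # trim any stray trailing quote characters.
        if '"""' in out:
            parts = out.split('"""')
            result = parts[1].strip() if len(parts) > 1 else parts[0].strip()
        else:
            result = out.strip()
        while result.endswith('"'):
            result = result[:-1]
        return result.strip()

    # Fenced and unfenced model output reduce to the same '||'-separated payload.
    assert extract_payload('"""Title||Summary||Content||2024-04-01"""') == 'Title||Summary||Content||2024-04-01'
    assert extract_payload('Title||Summary||Content||2024-04-01') == 'Title||Summary||Content||2024-04-01'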

core/scrapers/simple_crawler.py

Lines changed: 6 additions & 1 deletion

@@ -36,7 +36,12 @@ async def simple_crawler(url: str, logger) -> (int, dict):
     rawdata = response.content
     encoding = chardet.detect(rawdata)['encoding']
     text = rawdata.decode(encoding, errors='replace')
-    result = extractor.extract(text)
+    try:
+        result = extractor.extract(text)
+    except Exception as e:
+        logger.info(f"gne extracct error: {e}")
+        return 0, {}
+
     if not result:
         logger.error(f"gne cannot extract {url}")
         return 0, {}
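
The extract call is now wrapped so that a page that makes gne raise is treated the same as a page it cannot parse, instead of aborting the whole crawl batch. A minimal sketch of the guarded pattern follows; it assumes the gne GeneralNewsExtractor is how extractor is constructed elsewhere in this file, and the helper name and log wording are illustrative.

    from gne import GeneralNewsExtractor

    extractor = GeneralNewsExtractor()

    def safe_extract(html: str, url: str, logger) -> dict:
        # gne can raise on malformed markup; map that onto the same "empty result"
        # path used when extraction simply finds nothing.
        try:
            result = extractor.extract(html)
        except Exception as e:
            logger.info(f"gne extract error for {url}: {e}")
            return {}
        if not result:
            logger.error(f"gne cannot extract {url}")
            return {}
        return result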

core/tasks.py

Lines changed: 30 additions & 144 deletions

@@ -1,152 +1,38 @@
-"""
-通过编辑这个脚本,可以自定义需要的后台任务
-"""
-import schedule
-import time
-from topnews import pipeline
-from loguru import logger
-from utils.pb_api import PbTalker
-import os
-from utils.general_utils import get_logger_level
-from datetime import datetime, timedelta
-import pytz
-import requests
+import asyncio
+from insights import pipeline, pb, logger

+counter = 0

-project_dir = os.environ.get("PROJECT_DIR", "")
-if project_dir:
-    os.makedirs(project_dir, exist_ok=True)
-logger_file = os.path.join(project_dir, 'tasks.log')
-dsw_log = get_logger_level()
-logger.add(
-    logger_file,
-    level=dsw_log,
-    backtrace=True,
-    diagnose=True,
-    rotation="50 MB"
-)

-pb = PbTalker(logger)
-utc_now = datetime.now(pytz.utc)
-# 减去一天得到前一天的UTC时间
-utc_yesterday = utc_now - timedelta(days=1)
-utc_last = utc_yesterday.strftime("%Y-%m-%d %H:%M:%S")
-
-
-def task():
-    """
-    global counter
-    sites = pb.read('sites', filter='activated=True')
-    urls = []
-    for site in sites:
-        if not site['per_hours'] or not site['url']:
-            continue
-        if counter % site['per_hours'] == 0:
-            urls.append(site['url'])
-    logger.info(f'\033[0;32m task execute loop {counter}\033[0m')
-    logger.info(urls)
-    if urls:
-        sp(sites=urls)
-    else:
-        if counter % 24 == 0:
-            sp()
-        else:
-            print('\033[0;33mno work for this loop\033[0m')
-    counter += 1
-    """
-    global utc_last
-    logger.debug(f'last_collect_time: {utc_last}')
-    datas = pb.read(collection_name='insights', filter=f'updated>="{utc_last}"', fields=['id', 'content', 'tag', 'articles'])
-    logger.debug(f"got {len(datas)} items")
-    utc_last = datetime.now(pytz.utc).strftime("%Y-%m-%d %H:%M:%S")
-    logger.debug(f'now_utc_time: {utc_last}')
-
-    tags = pb.read(collection_name='tags', filter=f'activated=True')
-    tags_dict = {item["id"]: item["name"] for item in tags if item["name"]}
-    top_news = {}
-    for id, name in tags_dict.items():
-        logger.debug(f'tag: {name}')
-        data = [item for item in datas if item['tag'] == id]
-        topnew = pipeline(data, logger)
-        if not topnew:
-            logger.debug(f'no top news for {name}')
-            continue
-
-        top_news[id] = {}
-        for content, articles in topnew.items():
-            content_urls = [pb.read('articles', filter=f'id="{a}"', fields=['url'])[0]['url'] for a in articles]
-            # 去除重叠内容
-            # 如果发现重叠内容,哪个标签长就把对应的从哪个标签删除
-            to_skip = False
-            for k, v in top_news.items():
-                to_del_key = None
-                for c, u in v.items():
-                    if not set(content_urls).isdisjoint(set(u)):
-                        if len(topnew) > len(v):
-                            to_skip = True
-                        else:
-                            to_del_key = c
-                        break
-                if to_del_key:
-                    del top_news[k][to_del_key]
-                if to_skip:
-                    break
-            if not to_skip:
-                top_news[id][content] = content_urls
-
-        if not top_news[id]:
-            del top_news[id]
-
-    if not top_news:
-        logger.info("no top news today")
+async def process_site(site, counter):
+    if not site['per_hours'] or not site['url']:
         return
+    if counter % site['per_hours'] == 0:
+        logger.info(f"applying {site['url']}")
+        request_input = {
+            "user_id": "schedule_tasks",
+            "type": "site",
+            "content": site['url'],
+            "addition": f"task execute loop {counter + 1}"
+        }
+        await pipeline(request_input)
+
+
+async def schedule_pipeline(interval):
+    global counter
+    while True:
+        sites = pb.read('sites', filter='activated=True')
+        logger.info(f'task execute loop {counter + 1}')
+        await asyncio.gather(*[process_site(site, counter) for site in sites])

-    # 序列化为字符串
-    top_news_text = {"#党建引领基层治理": [],
-                     "#数字社区": [],
-                     "#优秀活动案例": []}
-
-    for id, v in top_news.items():
-        # top_news[id] = {content: '\n\n'.join(urls) for content, urls in v.items()}
-        top_news[id] = {content: urls[0] for content, urls in v.items()}
-        if id == 's3kqj9ek8nvtthr':
-            top_news_text["#数字社区"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-        elif id == 'qpcgotbqyz3a617':
-            top_news_text["#优秀活动案例"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-        else:
-            top_news_text["#党建引领基层治理"].append("\n".join(f"{content}\n{urls}" for content, urls in top_news[id].items()))
-
-    top_news_text = {k: "\n".join(v) for k, v in top_news_text.items()}
-    top_news_text = "\n\n".join(f"{k}\n{v}" for k, v in top_news_text.items())
-    logger.info(top_news_text)
-
-    data = {
-        "wxid": "R:10860349446619856",
-        "content": top_news_text
-    }
-    try:
-        response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
-        if response.status_code == 200:
-            logger.info("send message to wechat success")
-        time.sleep(1)
-        data = {
-            "wxid": "R:10860349446619856",
-            "content": "[太阳] 今日份的临小助内参来啦!",
-            "atlist": ["@all"]
-        }
-        try:
-            response = requests.post("http://localhost:8088/api/sendtxtmsg", json=data)
-            if response.status_code == 200:
-                logger.info("send notify to wechat success")
-        except Exception as e:
-            logger.error(f"send notify to wechat failed: {e}")
-    except Exception as e:
-        logger.error(f"send message to wechat failed: {e}")
+        counter += 1
+        logger.info(f'task execute loop finished, work after {interval} seconds')
+        await asyncio.sleep(interval)


-schedule.every().day.at("07:38").do(task)
+async def main():
+    interval_hours = 1
+    interval_seconds = interval_hours * 60 * 60
+    await schedule_pipeline(interval_seconds)

-task()
-while True:
-    schedule.run_pending()
-    time.sleep(60)
+asyncio.run(main())
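
The rewrite replaces the schedule/requests-based task with a plain asyncio loop: every interval seconds it reads the activated sites, fans out process_site coroutines with asyncio.gather, bumps the loop counter, and sleeps. A self-contained sketch of just that scheduling skeleton, with read_sites() and run_site() standing in for pb.read(...) and pipeline(...); both stubs are illustrative.

    import asyncio

    def read_sites() -> list[dict]:
        # Stand-in for pb.read('sites', filter='activated=True').
        return [{"url": "https://example.com", "per_hours": 1}]

    async def run_site(site: dict, counter: int) -> None:
        # Stand-in for process_site(): honour per_hours before doing any work.
        if not site.get("per_hours") or not site.get("url"):
            return
        if counter % site["per_hours"] == 0:
            print(f"applying {site['url']} (loop {counter + 1})")

    async def schedule(interval: int, loops: int = 2) -> None:
        counter = 0
        for _ in range(loops):            # the real task uses `while True:`
            await asyncio.gather(*(run_site(s, counter) for s in read_sites()))
            counter += 1
            await asyncio.sleep(interval)

    asyncio.run(schedule(interval=1))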
