-from ..scrapers import *
-from ..utils.general_utils import extract_urls, compare_phrase_with_list
+# -*- coding: utf-8 -*-
+
+from scrapers import *
+from utils.general_utils import extract_urls, compare_phrase_with_list
 from .get_info import get_info, pb, project_dir, logger, info_rewrite
 import os
 import json
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 import re
-import time


 # The XML parsing scheme is not used because there are abnormal characters in the XML code extracted from the weixin public_msg
 existing_urls = [url['url'] for url in pb.read(collection_name='articles', fields=['url']) if url['url']]


-def pipeline(_input: dict):
+async def get_articles(urls: list[str], expiration: datetime, cache: dict = {}) -> list[dict]:
+    articles = []
+    for url in urls:
+        logger.debug(f"fetching {url}")
+
+        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
+            flag, result = await mp_crawler(url, logger)
+        else:
+            flag, result = await simple_crawler(url, logger)
+
+        if flag == -7:
+            # -7 means the html could not be fetched, and other crawlers will not help either
+            continue
+
+        if flag != 11:
+            flag, result = await llm_crawler(url, logger)
+            if flag != 11:
+                continue
+
+        expiration_date = expiration.strftime('%Y-%m-%d')
+        article_date = int(result['publish_time'])
+        if article_date < int(expiration_date.replace('-', '')):
+            logger.info(f"publish date is {article_date}, too old, skip")
+            continue
+
+        if url in cache:
+            for k, v in cache[url].items():
+                if v:
+                    result[k] = v
+
+        articles.append(result)
+
+    return articles
+
+
+async def pipeline(_input: dict):
     cache = {}
     source = _input['user_id'].split('@')[-1]
     logger.debug(f"received new task, user: {source}, MsgSvrID: {_input['addition']}")

+    global existing_urls
+    expiration_date = datetime.now() - timedelta(days=expiration_days)
+
     if _input['type'] == 'publicMsg':
         items = item_pattern.findall(_input["content"])
         # Iterate through all <item> content, extracting <url> and <summary>
@@ -37,73 +76,57 @@ def pipeline(_input: dict):
             cut_off_point = url.find('chksm=')
             if cut_off_point != -1:
                 url = url[:cut_off_point - 1]
+            if url in existing_urls:
+                logger.debug(f"{url} has been crawled, skip")
+                continue
             if url in cache:
                 logger.debug(f"{url} already find in item")
                 continue
             summary_match = summary_pattern.search(item)
             summary = summary_match.group(1) if summary_match else None
-            cache[url] = summary
-        urls = list(cache.keys())
+            cache[url] = {'source': source, 'abstract': summary}
+        articles = await get_articles(list(cache.keys()), expiration_date, cache)

-    elif _input['type'] == 'text':
+    elif _input['type'] == 'site':
+        # a site url is usually an article list page or a website homepage,
+        # so the list of article urls needs to be fetched first.
+        # You can use the general scraper, or customize a site-specific crawler; see scrapers/README_CN.md
         urls = extract_urls(_input['content'])
         if not urls:
-            logger.debug(f"can not find any url in\n{_input['content']}\npass...")
+            logger.debug(f"can not find any url in\n{_input['content']}")
             return
-    elif _input['type'] == 'url':
-        urls = []
-        pass
-    else:
-        return
-
-    global existing_urls
-
-    for url in urls:
-        if url in existing_urls:
-            logger.debug(f"{url} has been crawled, skip")
-            continue
-
-        logger.debug(f"fetching {url}")
-        if url.startswith('https://mp.weixin.qq.com') or url.startswith('http://mp.weixin.qq.com'):
-            flag, article = mp_crawler(url, logger)
-            if flag == -7:
-                # For the mp crawler, -7 most likely means rate limiting by WeChat, so just wait 1 min and retry
-                logger.info(f"fetch {url} failed, try to wait 1min and try again")
-                time.sleep(60)
-                flag, article = mp_crawler(url, logger)
-        else:
+        articles = []
+        for url in urls:
             parsed_url = urlparse(url)
             domain = parsed_url.netloc
             if domain in scraper_map:
-                flag, article = scraper_map[domain](url, logger)
+                result = scraper_map[domain](url, logger)
             else:
-                flag, article = simple_crawler(url, logger)
+                result = await general_scraper(url, expiration_date.date(), existing_urls, logger)
+            articles.extend(result)

-        if flag == -7:
-            # -7 means a network problem, and other crawlers will not help either
-            logger.info(f"cannot fetch {url}")
-            continue
+    elif _input['type'] == 'text':
+        urls = extract_urls(_input['content'])
+        if not urls:
+            logger.debug(f"can not find any url in\n{_input['content']}\npass...")
+            return
+        articles = await get_articles(urls, expiration_date)

-        if flag != 11:
-            logger.info(f"{url} failed with mp_crawler and simple_crawler")
-            flag, article = llm_crawler(url, logger)
-            if flag != 11:
-                logger.info(f"{url} failed with llm_crawler")
-                continue
+    elif _input['type'] == 'url':
+        # reserved for the WeChat shared mp_article_card
+        # todo: will be handled in project awada (the generalMsg api needs to be finished first)
+        articles = []
+    else:
+        return

-        expiration_date = datetime.now() - timedelta(days=expiration_days)
-        expiration_date = expiration_date.strftime('%Y-%m-%d')
-        article_date = int(article['publish_time'])
-        if article_date < int(expiration_date.replace('-', '')):
-            logger.info(f"publish date is {article_date}, too old, skip")
+    for article in articles:
+        if article['url'] in existing_urls:
+            # when multiple sites are submitted at the same time,
+            # duplicate articles can indeed end up in the same batch
+            logger.debug(f"{article['url']} duplicated, skip")
             continue

-        article['source'] = source
-        if cache[url]:
-            article['abstract'] = cache[url]
-
         insights = get_info(f"title: {article['title']}\n\ncontent: {article['content']}")
-
         try:
             article_id = pb.add(collection_name='articles', body=article)
         except Exception as e:
@@ -112,7 +135,7 @@ def pipeline(_input: dict):
                 json.dump(article, f, ensure_ascii=False, indent=4)
             continue

-        existing_urls.append(url)
+        existing_urls.append(article['url'])

         if not insights:
             continue
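
In the new 'site' branch, the pipeline first looks up the url's domain in scraper_map and only falls back to general_scraper when no site-specific crawler is registered. Below is a minimal sketch of what registering such a crawler could look like. The function name, the example domain, and the returned fields are assumptions inferred from how the results are consumed downstream (article['url'], article['title'], article['content']); the authoritative contract is described in scrapers/README_CN.md.

# A hedged sketch, not part of this commit: registering a site-specific scraper.
from urllib.parse import urlparse


def example_site_scraper(url: str, logger) -> list[dict]:
    # hypothetical custom scraper for one site; a real one would fetch and parse the page
    logger.debug(f"custom scraping {url}")
    # field names are assumptions based on the downstream code in this diff
    return [{'url': url, 'title': 'placeholder', 'content': 'placeholder', 'publish_time': '20240101'}]


# scraper_map maps a domain (netloc) to its scraper, as used in the diff above
scraper_map = {'news.example.com': example_site_scraper}

domain = urlparse('https://news.example.com/list').netloc
handler = scraper_map.get(domain)  # the pipeline falls back to general_scraper when the domain is absent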
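Because pipeline() is now a coroutine, callers have to await it or drive it with an event loop instead of calling it directly. A minimal usage sketch follows; the message values are invented for illustration and only mirror the fields the function reads ('user_id', 'type', 'content', 'addition').

# A hedged usage sketch, assuming pipeline() is imported from this module.
import asyncio

example_msg = {
    'user_id': 'bot@example.com',            # source is taken from the part after '@'
    'type': 'site',                          # one of: 'publicMsg', 'site', 'text', 'url'
    'content': 'https://news.example.com/list',
    'addition': 'MsgSvrID-placeholder',
}

asyncio.run(pipeline(example_msg))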