diff --git a/.gitignore b/.gitignore index f83b6c757..4242f3a93 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,4 @@ __pycache__ .yarn .env* **/node_modules/** -.DS_Store -data -qdrant_storage/** \ No newline at end of file +.DS_Store \ No newline at end of file diff --git a/aero/.gitignore b/aero/.gitignore index e1979a048..ad4a1f17f 100644 --- a/aero/.gitignore +++ b/aero/.gitignore @@ -174,8 +174,3 @@ poetry.toml pyrightconfig.json # End of https://www.toptal.com/developers/gitignore/api/python -唤起 Refly -⌘B -关闭 -Refly -Refly \ No newline at end of file diff --git a/aero/data/mp.weixin.json b/aero/data/mp.weixin.json new file mode 100644 index 000000000..c5338b1b5 --- /dev/null +++ b/aero/data/mp.weixin.json @@ -0,0 +1,30 @@ +[ + { + "url": "https://mp.weixin.qq.com/s/s7H_0nzCw57-FY0yUnCXjQ", + "title": "RAG 修炼手册|一文讲透 RAG 背后的技术" + }, + { + "url": "https://mp.weixin.qq.com/s/5mcafRElVm3-i9SqmqUoCw", + "title": "AI 硬件万字长文:一直游到海水变蓝" + }, + { + "url": "https://mp.weixin.qq.com/s/zL-Ro2JU3F_-yhu9RlSjlw", + "title": "Z Product | 华人之光!PyTorch之母的复旦女生开创模型平台,Benchmark、红杉等顶流投资" + }, + { + "url": "https://mp.weixin.qq.com/s/UO8hSKiQxS1j3YirRSraPQ", + "title": "无限追问?AI交互式搜索工具Flowith,内置MJ、GPT、SD等多种产品,功能齐全!" 
+ }, + { + "url": "https://mp.weixin.qq.com/s/FbHTyHqEBJT-1PhA5x7FRg", + "title": "Linux之父讽刺AI炒作:很搞笑,大概我也会被大模型取代" + }, + { + "url": "https://mp.weixin.qq.com/s/e2n4ttcT8raDU877t53GPQ", + "title": "Llama 3超大杯有何惊喜?Meta会一直开源吗?当初为何笃信元宇宙?扎克伯格新访谈回应一切" + }, + { + "url": "https://mp.weixin.qq.com/s/ixGAGRp9cdcPza45vORQ6w", + "title": "再见!波士顿动力人形机器人Atlas​" + } +] diff --git a/aero/data/unstructured-io.github.io.json b/aero/data/unstructured-io.github.io.json new file mode 100644 index 000000000..aad8bf22e --- /dev/null +++ b/aero/data/unstructured-io.github.io.json @@ -0,0 +1,474 @@ +[ + { + "url": "https://unstructured-io.github.io/unstructured/introduction.html", + "title": "Introduction - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/best_practices/table_extraction_pdf.html", + "title": "Table Extraction from PDF - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/best_practices/strategies.html", + "title": "Strategies - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/best_practices.html", + "title": "Best Practices - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/examples/chroma.html", + "title": "Data Processing into Vector Database - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/examples/dict_to_elements.html", + "title": "Multi-files API Processing - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/examples/databricks.html", + "title": "Delta Table Source Connector - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/integrations.html", + "title": "Integrations - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/best_practices/models.html", + "title": 
"Models - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/examples.html", + "title": "Examples - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/metadata.html", + "title": "Metadata - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/fsspec_config.html", + "title": "Fsspec Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/embedding_config.html", + "title": "Embedding Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/chunking_config.html", + "title": "Chunking Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/retry_strategy_config.html", + "title": "Retry Strategy Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/permissions_config.html", + "title": "Permissions Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/processor_config.html", + "title": "Processor Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/partition_config.html", + "title": "Partition Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs/read_config.html", + "title": "Read Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/configs.html", + "title": "Ingest Configuration - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/ingest/destination_connectors/weaviate.html", + "title": "Weaviate - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/vectara.html", + "title": "Vectara - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/s3.html", + "title": "S3 - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/sql.html", + "title": "SQL - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/opensearch.html", + "title": "OpenSearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/qdrant.html", + "title": "Qdrant - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/mongodb.html", + "title": "MongoDB - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/gcs.html", + "title": "Google Cloud Service - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/pinecone.html", + "title": "Pinecone - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/elasticsearch.html", + "title": "Elasticsearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/dropbox.html", + "title": "Dropbox - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/databricks_volumes.html", + "title": "Databricks Volumes - 
Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/delta_table.html", + "title": "Delta Table - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/clarifai.html", + "title": "Clarifai - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/chroma.html", + "title": "Chroma - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/box.html", + "title": "Box - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/azure_cognitive_search.html", + "title": "Azure Cognitive Search - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/azure.html", + "title": "Azure - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/slack.html", + "title": "Slack - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors.html", + "title": "Destination Connectors - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/wikipedia.html", + "title": "Wikipedia - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/sharepoint.html", + "title": "Sharepoint - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/astra.html", + "title": "Astra - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/ingest/source_connectors/sftp.html", + "title": "Sftp - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/salesforce.html", + "title": "Salesforce - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/s3.html", + "title": "S3 - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/reddit.html", + "title": "Reddit - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/outlook.html", + "title": "Outlook - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/opensearch.html", + "title": "OpenSearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/onedrive.html", + "title": "One Drive - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/mongodb.html", + "title": "MongoDB - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/notion.html", + "title": "Notion - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/google_cloud_storage.html", + "title": "Google Cloud Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/github.html", + "title": "Github - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/local_connector.html", + "title": "Local - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/ingest/source_connectors/dropbox.html", + "title": "Dropbox - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/google_drive.html", + "title": "Google Drive - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/jira.html", + "title": "Jira - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/elasticsearch.html", + "title": "Elasticsearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/gitlab.html", + "title": "Gitlab - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/discord.html", + "title": "Discord - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/delta_table.html", + "title": "Delta Table - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/confluence.html", + "title": "Confluence - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/biomed.html", + "title": "Biomed - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/box.html", + "title": "Box - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/azure.html", + "title": "Azure - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/airtable.html", + "title": "Airtable - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/ingest/source_connectors.html", + "title": "Source Connectors - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/embedding.html", + "title": "Embedding - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/ingest/index.html", + "title": "Ingest - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/chunking.html", + "title": "Chunking - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/extracting.html", + "title": "Extracting - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/staging.html", + "title": "Staging - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/partition.html", + "title": "Partitioning - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/weaviate.html", + "title": "Weaviate - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core/cleaning.html", + "title": "Cleaning - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/core.html", + "title": "Core Functionality - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/postgresql.html", + "title": "PostgreSQL - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/pinecone.html", + "title": "Pinecone - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/opensearch.html", + "title": "OpenSearch - Unstructured 0.13.0 
documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/mongodb.html", + "title": "MongoDB - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/google_cloud_destination.html", + "title": "Google Cloud Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/elasticsearch_destination.html", + "title": "Elasticsearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/databricks.html", + "title": "Databricks - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/azure_cognitive_search.html", + "title": "Azure Cognitive Search - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/chroma.html", + "title": "Chroma - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/sftp.html", + "title": "SFTP Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/destination_platform.html", + "title": "Platform Destination Connectors - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/amazon_s3_destination.html", + "title": "Amazon S3 - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/salesforce.html", + "title": "Salesforce - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/sharepoint.html", + "title": "Sharepoint - Unstructured 0.13.0 
documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/opensearch.html", + "title": "OpenSearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/onedrive.html", + "title": "OneDrive Cloud Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/google_cloud_source.html", + "title": "Google Cloud Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/google_drive.html", + "title": "Google Drive - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/azure_blob.html", + "title": "Azure Blob Storage - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/amazon_s3_source.html", + "title": "Amazon S3 - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/elasticsearch_source.html", + "title": "Elasticsearch - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/source_platform.html", + "title": "Platform Source Connectors - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/job.html", + "title": "Jobs Scheduling - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platforms/workflow.html", + "title": "Workflows Automation - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/platform.html", + "title": "Unstructured Platform - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/apis/validation_errors.html", + "title": "API Validation Errors - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/api_parameters.html", + "title": "API Parameters - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/usage_methods.html", + "title": "Accessing Unstructured API - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/api_sdks.html", + "title": "Python and JavaScript SDK - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/aws_marketplace.html", + "title": "AWS Marketplace Deployment Guide - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/azure_marketplace.html", + "title": "Azure Marketplace Deployment Guide - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/api.html", + "title": "Unstructured API Services - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/apis/saas_api.html", + "title": "SaaS API Deployment Guide - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/introduction.html", + "title": "Introduction - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/installing.html", + "title": "Installation - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/installation/docker.html", + "title": "Docker Installation - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/installation/full_installation.html", + "title": "Full Installation - Unstructured 0.13.0 documentation" + }, + { + "url": 
"https://unstructured-io.github.io/unstructured/introduction/key_concepts.html", + "title": "Key Concepts - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/introduction/overview.html", + "title": "Document Elements - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/introduction/getting_started.html", + "title": "Quick Start - Unstructured 0.13.0 documentation" + }, + { + "url": "https://unstructured-io.github.io/unstructured/index.html", + "title": "Unstructured 0.13.0 documentation" + } +] diff --git a/aero/main.py b/aero/main.py index aa70eeba5..9a308b1d6 100644 --- a/aero/main.py +++ b/aero/main.py @@ -10,7 +10,7 @@ _cleanup_coroutines = [] -class Greeter(aero_pb2_grpc.AeroServicer): +class AeroService(aero_pb2_grpc.AeroServicer): async def ParseHTML( self, request: aero_pb2.ParseHTMLRequest, @@ -32,7 +32,7 @@ async def ParseHTML( async def serve() -> None: server = grpc.aio.server() - aero_pb2_grpc.add_AeroServicer_to_server(Greeter(), server) + aero_pb2_grpc.add_AeroServicer_to_server(AeroService(), server) listen_addr = "[::]:50051" server.add_insecure_port(listen_addr) logging.info("Starting server on %s", listen_addr) diff --git a/aero/md.py b/aero/md.py new file mode 100644 index 000000000..85e60469f --- /dev/null +++ b/aero/md.py @@ -0,0 +1,24 @@ +from unstructured.partition.md import partition_md +from langchain_text_splitters import MarkdownHeaderTextSplitter + + +# load documents +headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), +] + +with open('./jina_data/intro.md') as fp: + markdown_document = fp.read() + +markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False) +md_header_splits = markdown_splitter.split_text(markdown_document) + +for split in md_header_splits: + print(split.page_content) + print("~" * 180) + +# for chunk in chunks: +# print(chunk.text) 
"""Scratch script: partition a web page into title-based chunks and print them."""
from unstructured.partition.html import partition_html
from unstructured.cleaners.core import clean
from unstructured.chunking.title import chunk_by_title

# Alternative inputs that were tried while experimenting with this script:
# url = "https://unstructured-io.github.io/unstructured/introduction.html"
# chunks = partition_html(
#     filename="./html_files/https_unstructured-io.github.io_unstructured_introduction.html.html",
# )
# chunks = partition_html(url=url, chunking_strategy='basic', max_characters=1000, new_after_n_chars=800, overlap=400)

# First pass: partition the page at the given URL, chunking by title.
partitioned = partition_html(
    url='https://refly.ai/',
    chunking_strategy='by_title',
    max_characters=1000,
    new_after_n_chars=800,
)

# Drop short fragments: keep only chunks with more than 10 whitespace-separated tokens.
wordy = list(filter(lambda element: len(element.text.split()) > 10, partitioned))

# Second pass: chunk the surviving elements by title again with a larger hard limit.
chunks = chunk_by_title(wordy, max_characters=1200, new_after_n_chars=800)

# Print every chunk with extra whitespace cleaned up, followed by a separator line.
separator = "-" * 50
for chunk in chunks:
    print(clean(chunk.text, extra_whitespace=True))
    print(separator)
"""Store web pages as text chunks in a Weaviate collection and query them.

Run as a script, this module makes sure the collection exists and then runs a
sample hybrid search. The other entry points are left commented out in the
``__main__`` block.
"""
import pickle
import os
import json
import requests

import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import MetadataQuery, HybridFusion
from weaviate.auth import AuthApiKey
from weaviate.util import generate_uuid5
from weaviate.classes.config import Configure, Property, DataType

from langchain_text_splitters import CharacterTextSplitter

COLLECTION_NAME = "Refly"
READER_URL = "https://r.jina.ai/"
# Upper bound (seconds) for a single reader request, so one slow page cannot
# block a batch run forever.
REQUEST_TIMEOUT = 60

# The client is created at import time from environment variables.
client = weaviate.connect_to_wcs(
    cluster_url=os.environ['WEAVIATE_CLUSTER_URL'],
    auth_credentials=AuthApiKey(os.environ['WEAVIATE_API_KEY']),
    skip_init_checks=True,
    headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"),
    }
)


def init_collection():
    """Create the collection if it does not exist yet; otherwise do nothing."""
    if client.collections.exists(COLLECTION_NAME):
        return

    client.collections.create(
        COLLECTION_NAME,
        properties=[
            Property(name="url", data_type=DataType.TEXT),
            Property(name="type", data_type=DataType.TEXT),
            Property(
                name="title",
                data_type=DataType.TEXT,
                index_filterable=True,
                index_searchable=True,
            ),
            Property(
                name="content",
                data_type=DataType.TEXT,
                index_filterable=True,
                index_searchable=True,
            ),
        ],
        # Only the "content" property feeds the named vector.
        vectorizer_config=[
            Configure.NamedVectors.text2vec_openai(
                name="content",
                model="text-embedding-3-large",
                dimensions=256,
                source_properties=["content"],
            )
        ],
    )


def store_link(url: str, title: str):
    """Fetch a page through the reader endpoint, chunk it and insert the chunks.

    Args:
        url: Address of the page; it is appended to ``READER_URL`` for fetching
            and stored on every chunk.
        title: Title stored on every chunk.

    Raises:
        requests.RequestException: If the reader request fails, times out or
            returns an HTTP error status.
    """
    resp = requests.get(READER_URL + url, timeout=REQUEST_TIMEOUT)
    # Do not index the body of an error response as if it were page content.
    resp.raise_for_status()

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        encoding_name="cl100k_base", chunk_size=800, chunk_overlap=400
    )
    texts = text_splitter.split_text(resp.text)
    print(f'processing {url} with {len(texts)} chunks')
    if not texts:
        # Nothing to insert for an empty page.
        return

    collection = client.collections.get(COLLECTION_NAME)
    data_objects = list()

    for chunk in texts:
        properties = {
            "url": url,
            "type": "weblink",
            "title": title,
            "content": chunk,
        }
        # The UUID is derived from the properties, so identical chunks map to
        # the same object id.
        data_objects.append(
            wvc.data.DataObject(properties=properties, uuid=generate_uuid5(properties))
        )

    res = collection.data.insert_many(data_objects)
    print(res)


def _store_item(item):
    """Store one ``{"url", "title"}`` record, reporting fetch failures instead of aborting."""
    try:
        store_link(item["url"], item["title"])
    except requests.RequestException as exc:
        print(f'failed to fetch {item["url"]}: {exc}')


def load_html_from_local_pickle(fpath: str):
    """Store the first two records of a pickled list of link records.

    Only each record's "url" and "title" are used; ``store_link`` fetches the
    page content itself.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only use this with pickle files produced locally by a trusted process.
    with open(fpath, "rb") as f:
        data = pickle.load(f)

    for item in data[:2]:
        print(item)
        _store_item(item)


def load_html_from_local_json(fpath: str):
    """Store every record of a JSON file holding a list of ``{"url", "title"}`` objects."""
    # Explicit UTF-8: the data files contain non-ASCII titles.
    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in data:
        _store_item(item)


def read_data():
    """Print the UUID and the length of the "content" vector of every stored object."""
    collection = client.collections.get(COLLECTION_NAME)

    for item in collection.iterator(include_vector=True):
        print(item.uuid, len(item.vector['content']))


def search_data(query: str):
    """Run a hybrid query and print the top three objects with their score details."""
    collection = client.collections.get(COLLECTION_NAME)
    response = collection.query.hybrid(
        query=query,
        limit=3,
        return_metadata=MetadataQuery(score=True, explain_score=True),
    )

    for o in response.objects:
        print(o.properties)
        print(o.metadata.score, o.metadata.explain_score)


def delete_data_objects():
    """Delete every object in the collection, one by one, printing each UUID."""
    collection = client.collections.get(COLLECTION_NAME)

    for item in collection.iterator():
        print('delete', item.uuid)
        collection.data.delete_by_id(item.uuid)


if __name__ == "__main__":
    try:
        init_collection()
        # store_link('https://unstructured-io.github.io/unstructured/core/chunking.html', 'chunking')
        # delete_data_objects()
        # load_html_from_local_json('./data/mp.weixin.json')
        search_data('Linus')
        # load_html_from_local_pickle('./unstructured-io.github.io.pickle')
        # read_data()
    finally:
        # Always release the connection, even when a step above fails.
        client.close()
@default("") @map("conv_id") /// 用户id userId Int @map("user_id") /// 内容 id,指向 aigc_content 中的内容 @@ -140,6 +146,8 @@ model Conversation { model ChatMessage { /// id为主键 id Int @id @default(autoincrement()) + /// 消息id + msgId String? @default("") @map("msg_id") /// 会话id conversationId Int @map("conversation_id") /// 消息来源 @@ -203,29 +211,59 @@ model UserWeblink { model Weblink { /// id为主键 - id Int @id @default(autoincrement()) + id Int @id @default(autoincrement()) + /// 网页id + linkId String? @default("") @map("link_id") /// 网页链接 - url String @unique @map("url") + url String @unique @map("url") /// 页面内容 (deprecated) - pageContent String @map("page_content") + pageContent String @map("page_content") /// 对象存储 key - storageKey String @default("") @map("storage_key") + storageKey String @default("") @map("storage_key") /// 页面元数据, JSON 存储 - pageMeta String @map("page_meta") + pageMeta String @map("page_meta") /// 内容元数据, JSON 存储 - contentMeta String @map("content_meta") + contentMeta String @map("content_meta") /// 索引状态 - indexStatus IndexStatus @default(init) @map("index_status") + indexStatus IndexStatus @default(init) @map("index_status") + /// 解析策略版本,格式统一为 YYYYMMDD + parserVersion String? @default("00000000") @map("parser_version") + /// 解析后内容存储 key + parsedDocStorageKey String? @default("") @map("parsed_doc_storage_key") + /// 切块后数据存储 key + chunkStorageKey String? @default("") @map("chunk_storage_key") + /// 上次解析时间 + lastParseTime DateTime? 
@default(now()) @map("last_parse_time") @db.Timestamptz() /// 创建时间 - createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz() + createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz() /// 更新时间 - updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz() + updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz() contents AigcContent[] @@map("weblinks") } +model WeblinkLog { + /// id为主键 + id Int @id @default(autoincrement()) + /// 关联的链接 id + weblinkId Int @map("weblink_id") + /// 用户id + userId Int @map("user_id") + /// 解析策略版本 + parserVersion String @default("00000000") @map("parser_version") + /// 访问时间 + visitTime DateTime @default(now()) @map("visit_time") @db.Timestamptz() + /// 创建时间 + createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz() + /// 更新时间 + updatedAt DateTime @updatedAt @map("updated_at") @db.Timestamptz() + + @@index([weblinkId, visitTime]) + @@map("weblink_log") +} + model WeblinkUserMark { /// id为主键 id Int @id @default(autoincrement()) @@ -255,6 +293,8 @@ model WeblinkUserMark { model AigcContent { /// 主键 id Int @id @default(autoincrement()) + /// 内容id + cid String? @default("") @map("cid") /// 标题 title String @map("title") /// 摘要 @@ -295,6 +335,8 @@ model AigcContent { model UserDigest { /// 主键 id Int @id @default(autoincrement()) + /// 摘要 id + digestId String? 
@default("") @map("digest_id") /// 用户 id userId Int @map("user_id") /// 日期 (YYYY-MM-DD) diff --git a/reflyd/scripts/hybrid-search.ts b/reflyd/scripts/hybrid-search.ts new file mode 100644 index 000000000..515fcfde7 --- /dev/null +++ b/reflyd/scripts/hybrid-search.ts @@ -0,0 +1,68 @@ +import { MilvusClient } from '@zilliz/milvus2-sdk-node'; +import { DataType } from '@zilliz/milvus2-sdk-node'; + +const address = 'localhost:19530'; +const token = 'root:Milvus'; +const ssl = false; +const milvusClient = new MilvusClient({ address, ssl, token }); + +const params = { + collection_name: 'refly', + description: 'Refly Content Search', + fields: [ + { + name: 'url', + description: 'weblink url', + data_type: DataType.VarChar, + }, + { + name: 'title', + description: 'weblink title', + data_type: DataType.VarChar, + max_length: 500, + }, + { + name: 'type', + description: 'content type', + data_type: DataType.VarChar, + max_length: 100, + }, + { + name: 'chunk_id', + description: 'chunk id', + data_type: DataType.VarChar, + max_length: 100, + }, + { + name: 'content', + description: 'chunked content', + data_type: DataType.VarChar, + max_length: 10000, + }, + { + name: 'vector', + description: 'vector of chunked content', + data_type: DataType.FloatVector, + dim: 256, + }, + ], + enableDynamicField: true, +}; + +async function main() { + const res = await milvusClient.createCollection(params); + console.log('create collections status:', res); + + const loadRes = await milvusClient.loadCollection({ + collection_name: params.collection_name, + }); + console.log('load collections status:', loadRes); + + await milvusClient.hybridSearch({ + collection_name: 'refly', + anns_field: 'vector', + data: {}, + }); +} + +main(); diff --git a/reflyd/src/aigc/aigc.service.ts b/reflyd/src/aigc/aigc.service.ts index fab7ad9a5..020f59043 100644 --- a/reflyd/src/aigc/aigc.service.ts +++ b/reflyd/src/aigc/aigc.service.ts @@ -91,10 +91,7 @@ export class AigcService { return { ...content, inputs 
}; } - private async updateUserPreferences(param: { - uwb: UserWeblink; - meta: ContentMeta; - }) { + async updateUserPreferences(param: { uwb: UserWeblink; meta: ContentMeta }) { const { uwb, meta } = param; // 对于每一个标注的 topic,更新用户喜好 @@ -126,7 +123,7 @@ export class AigcService { * @param param * @returns */ - private async upsertUserDigest(param: { + async upsertUserDigest(param: { uwb: UserWeblink; content: AigcContent; meta: ContentMeta; @@ -135,50 +132,6 @@ export class AigcService { const { userId } = uwb; const today = new Date().toISOString().split('T')[0]; - // const digest = await this.prisma.userDigest.findUnique({ - // where: { - // userId_date_topicKey: { - // userId, - // date: today, - // topicKey: meta.topics[0].key, - // }, - // }, - // }); - - // 如果该 topic 下已有摘要,进行增量总结 - // if (digest) { - // const dContent = await this.prisma.aigcContent.findUnique({ - // where: { id: digest.contentId }, - // include: { inputs: true }, - // }); - - // // 如果该 digest 输入的 content 已包含新的 content,则不做任何增量总结 - // if (dContent.inputIds.includes(content.id)) { - // this.logger.log( - // `digest ${digest.id} already contains content ${content.id}`, - // ); - // return; - // } - - // const combinedContent = await this.llmService.summarizeMultipleWeblink([ - // ...dContent.inputs, - // content, - // ]); - - // // 更新 aigc 依赖关系 - // this.prisma.$transaction(async (tx) => { - // await tx.aigcContent.update({ - // where: { id: dContent.id }, - // data: { ...combinedContent, inputIds: { push: content.id } }, - // }); - // await tx.aigcContent.update({ - // where: { id: content.id }, - // data: { outputIds: { push: dContent.id } }, - // }); - // }); - - // return; - // } // 创建新的 digest 内容及其对应的记录 this.prisma.$transaction(async (tx) => { @@ -263,54 +216,6 @@ export class AigcService { }); } - // /** - // * Dispatch feed to target users. 
- // * @param content aigc content - // * @returns - // */ - // private async dispatchFeed(param: { - // weblink: Weblink; - // meta: ContentMeta; - // content: AIGCContent; - // }) { - // const { weblink, meta, content } = param; - - // // topic 管理先简单点,就用 key 去匹配 - // // 介绍文案先前端写死 - // // await this.ensureTopics(meta); - - // // Find users related to this content - // const userIds = await this.prisma.userPreference.findMany({ - // select: { userId: true }, - // where: { - // topicKey: { in: meta.topics.map((t) => t.key) }, - // score: { gte: 0 }, // TODO: 设计更合适的推荐门槛 - // }, - // }); - - // // Check if these users have read this source - // const readLinkUsers = await this.prisma.userWeblink.findMany({ - // select: { userId: true }, - // where: { - // url: weblink.url, - // userId: { in: userIds.map((u) => u.userId) }, - // }, - // }); - // const readUserSet = new Set(readLinkUsers.map((elem) => elem.userId)); - // const unreadUsers = userIds.filter((u) => !readUserSet.has(u.userId)); - - // // Add feed records for unread users - // if (unreadUsers.length > 0) { - // this.logger.log(`add feed ${content.id} to users: ${unreadUsers}`); - // await this.prisma.userFeed.createMany({ - // data: unreadUsers.map((u) => ({ - // userId: u.userId, - // contentId: content.id, - // })), - // }); - // } - // } - /** * 处理全局内容流程: 应用内容策略,分发 feed * @param doc @@ -318,11 +223,10 @@ export class AigcService { */ async runContentFlow(param: { doc: Document; - link: WebLinkDTO; uwb: UserWeblink; weblink: Weblink; }) { - const { doc, weblink, uwb } = param; + const { doc, uwb, weblink } = param; let meta: ContentMeta; if (!weblink.contentMeta) { @@ -335,14 +239,14 @@ export class AigcService { ); return; } - if (shouldRunIndexPipeline(meta)) { - await this.llmService.indexPipelineFromLink(weblink.id, doc); - } - await this.prisma.weblink.update({ - where: { id: weblink.id }, - data: { contentMeta: JSON.stringify(meta), indexStatus: 'finish' }, - }); - await this.ensureTopics(meta); + + 
await Promise.all([ + this.prisma.weblink.update({ + where: { id: weblink.id }, + data: { contentMeta: JSON.stringify(meta) }, + }), + this.ensureTopics(meta), + ]); } else { meta = JSON.parse(weblink.contentMeta); } @@ -367,7 +271,6 @@ export class AigcService { meta, user, }); - // await this.dispatchFeed({ ...param, meta, content }); await this.runUserContentFlow({ ...param, meta, content }); } @@ -400,7 +303,3 @@ export class AigcService { } } } - -function shouldRunIndexPipeline(meta: ContentMeta): boolean { - return true; -} diff --git a/reflyd/src/app.module.ts b/reflyd/src/app.module.ts index 36e7a4ba8..1b1778c88 100644 --- a/reflyd/src/app.module.ts +++ b/reflyd/src/app.module.ts @@ -9,6 +9,7 @@ import { UserModule } from './user/user.module'; import { LlmModule } from './llm/llm.module'; import { AccountModule } from './account/account.module'; import { WeblinkModule } from './weblink/weblink.module'; +import { RAGModule } from './rag/rag.module'; import { AigcModule } from './aigc/aigc.module'; import { ConversationModule } from './conversation/conversation.module'; @@ -57,6 +58,7 @@ import { AppController } from './app.controller'; WeblinkModule, LlmModule, AigcModule, + RAGModule, ], controllers: [AppController], }) diff --git a/reflyd/src/common/common.module.ts b/reflyd/src/common/common.module.ts index 1ae325afa..a1d4355f2 100644 --- a/reflyd/src/common/common.module.ts +++ b/reflyd/src/common/common.module.ts @@ -3,10 +3,11 @@ import { ConfigModule } from '@nestjs/config'; import { PrismaService } from './prisma.service'; import { LoggerService } from './logger.service'; import { MinioService } from './minio.service'; +import { WeaviateService } from './weaviate.service'; @Module({ imports: [ConfigModule], - providers: [PrismaService, MinioService, LoggerService], - exports: [PrismaService, MinioService, LoggerService], + providers: [PrismaService, MinioService, LoggerService, WeaviateService], + exports: [PrismaService, MinioService, 
LoggerService, WeaviateService], }) export class CommonModule {} diff --git a/reflyd/src/common/weaviate.dto.ts b/reflyd/src/common/weaviate.dto.ts new file mode 100644 index 000000000..fe7d8968c --- /dev/null +++ b/reflyd/src/common/weaviate.dto.ts @@ -0,0 +1,37 @@ +export enum ContentType { + 'weblink', +} + +export interface ContentDataObj { + id: string; + url: string; + type: ContentType; + title: string; + content: string; + vector: number[]; +} + +export interface MetadataFilter { + url?: string; +} + +export interface HybridSearchParam { + tenantId: string; + query: string; + vector?: number[]; + filter?: MetadataFilter; + limit?: number; +} + +export interface SearchMeta { + score: string; + explainScore: string; +} + +export interface SearchResult { + url: string; + type: ContentType; + title: string; + content: string; + _additional: SearchMeta; +} diff --git a/reflyd/src/common/weaviate.service.ts b/reflyd/src/common/weaviate.service.ts new file mode 100644 index 000000000..5018ea65a --- /dev/null +++ b/reflyd/src/common/weaviate.service.ts @@ -0,0 +1,121 @@ +import { Injectable, Logger, OnModuleInit } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import weaviate, { + WeaviateClient, + ApiKey, + FusionType, +} from 'weaviate-ts-client'; + +import { + ContentDataObj, + HybridSearchParam, + SearchResult, +} from './weaviate.dto'; + +const reflyContentSchema = { + class: 'Content', + properties: [ + { + name: 'url', + dataType: ['text'], + }, + { + name: 'type', + dataType: ['text'], + }, + { + name: 'title', + dataType: ['text'], + }, + { + name: 'content', + dataType: ['text'], + }, + ], + multiTenancyConfig: { enabled: true }, +}; + +@Injectable() +export class WeaviateService implements OnModuleInit { + private readonly logger = new Logger(WeaviateService.name); + private client: WeaviateClient; + + constructor(private configService: ConfigService) { + this.client = weaviate.client({ + scheme: 'https', + host: 
this.configService.getOrThrow('vectorStore.host'), + apiKey: new ApiKey(this.configService.getOrThrow('vectorStore.apiKey')), + }); + } + + async onModuleInit() { + await this.ensureCollectionExists(); + } + + async ensureCollectionExists() { + let classDefinition = await this.client.schema + .classGetter() + .withClassName(reflyContentSchema.class) + .do(); + + if (!classDefinition) { + this.logger.log('class definition not found, create new one'); + classDefinition = await this.client.schema + .classCreator() + .withClass(reflyContentSchema) + .do(); + } + + this.logger.log( + 'collection definition: ' + JSON.stringify(classDefinition, null, 2), + ); + } + + async batchSaveData(tenantId: string, data: ContentDataObj[]) { + let batcher = this.client.batch.objectsBatcher(); + for (const obj of data) + batcher = batcher.withObject({ + class: reflyContentSchema.class, + properties: { + url: obj.url, + type: obj.type, + title: obj.title, + content: obj.content, + }, + id: obj.id, + vector: obj.vector, + tenant: tenantId, + }); + + // Flush + await batcher.do(); + } + + async hybridSearch(param: HybridSearchParam): Promise { + let getter = this.client.graphql + .get() + .withTenant(param.tenantId) + .withClassName(reflyContentSchema.class) + .withHybrid({ + query: param.query, + alpha: 0.5, + vector: param.vector, + fusionType: FusionType.rankedFusion, + }) + .withLimit(param.limit || 5) + .withFields('url type title content _additional { score explainScore }'); + + if (param.filter?.url) { + getter = getter.withWhere({ + path: ['url'], + operator: 'Equal', + valueText: param.filter.url, + }); + } + + const res = await getter.do(); + this.logger.log('hybrid search result: ' + JSON.stringify(res, null, 2)); + + return res.data?.Get?.[reflyContentSchema.class]; + } +} diff --git a/reflyd/src/config/app.config.ts b/reflyd/src/config/app.config.ts index 923c14a59..80c4584d1 100644 --- a/reflyd/src/config/app.config.ts +++ b/reflyd/src/config/app.config.ts @@ -13,7 +13,9 @@ 
export default () => ({ weblinkBucket: process.env.MINIO_WEBLINK_BUCKET || 'refly-weblink', }, vectorStore: { - vectorDim: parseInt(process.env.REFLY_VEC_DIM) || 1024, + host: process.env.WEAVIATE_INSTANCE_URL, + apiKey: process.env.WEAVIATE_API_KEY, + vectorDim: parseInt(process.env.REFLY_VEC_DIM) || 256, }, serper: { apiKey: process.env.SERPER_API_KEY, diff --git a/reflyd/src/conversation/conversation.controller.ts b/reflyd/src/conversation/conversation.controller.ts index 534878d78..698389c97 100644 --- a/reflyd/src/conversation/conversation.controller.ts +++ b/reflyd/src/conversation/conversation.controller.ts @@ -105,7 +105,7 @@ export class ConversationController { res.setHeader('Connection', 'keep-alive'); res.status(200); - await this.conversationService.chat(res, convId, req.user.id, body.task); + await this.conversationService.chat(res, req.user, convId, body.task); } @UseGuards(JwtAuthGuard) diff --git a/reflyd/src/conversation/conversation.service.ts b/reflyd/src/conversation/conversation.service.ts index 5027f7adc..46bc01d12 100644 --- a/reflyd/src/conversation/conversation.service.ts +++ b/reflyd/src/conversation/conversation.service.ts @@ -3,7 +3,7 @@ import { Response } from 'express'; import { PrismaService } from '../common/prisma.service'; import { CreateChatMessageInput, CreateConversationParam } from './dto'; -import { Prisma, ChatMessage } from '@prisma/client'; +import { Prisma, ChatMessage, User } from '@prisma/client'; import { LOCALE, QUICK_ACTION_TASK_PAYLOAD, @@ -93,7 +93,7 @@ export class ConversationService { }); } - async chat(res: Response, convId: number, userId: number, task: Task) { + async chat(res: Response, user: User, convId: number, task: Task) { const { taskType, data = {} } = task; const query = data?.question || ''; @@ -104,18 +104,18 @@ export class ConversationService { let taskRes: TaskResponse; if (taskType === TASK_TYPE.QUICK_ACTION) { - taskRes = await this.handleQuickActionTask(res, userId, task); + taskRes = await 
this.handleQuickActionTask(res, user, task); } else if (taskType === TASK_TYPE.SEARCH_ENHANCE_ASK) { taskRes = await this.handleSearchEnhanceTask(res, task, chatHistory); } else { - taskRes = await this.handleChatTask(res, userId, task, chatHistory); + taskRes = await this.handleChatTask(res, user, task, chatHistory); } res.end(``); const newMessages: CreateChatMessageInput[] = [ { type: 'human', - userId, + userId: user.id, conversationId: convId, content: query, sources: '', @@ -127,7 +127,7 @@ export class ConversationService { }, { type: 'ai', - userId, + userId: user.id, conversationId: convId, content: taskRes.answer, sources: JSON.stringify(taskRes.sources), @@ -147,17 +147,17 @@ export class ConversationService { async handleChatTask( res: Response, - userId: number, + user: User, task: Task, chatHistory: ChatMessage[], ): Promise { - const locale = task?.locale || LOCALE.EN; + const locale = task?.locale || (user.outputLocale as LOCALE) || LOCALE.EN; const filter: any = { must: [ { key: 'userId', - match: { value: userId }, + match: { value: user.uid }, }, ], }; @@ -193,10 +193,14 @@ export class ConversationService { ); const sources = chatFromClientSelector - ? await this.weblinkService.parseMultiWeblinks( + ? 
await this.weblinkService.readMultiWeblinks( task?.data?.filter?.weblinkList, ) - : await this.llmService.getRetrievalDocs(questionWithContext, filter); + : await this.llmService.getRetrievalDocs( + user.uid, + questionWithContext, + filter, + ); const { stream } = await this.llmService.chat( questionWithContext, @@ -300,7 +304,7 @@ export class ConversationService { async handleQuickActionTask( res: Response, - userId: number, + user: User, task: Task, ): Promise { const data = task?.data as QUICK_ACTION_TASK_PAYLOAD; @@ -328,13 +332,10 @@ export class ConversationService { if (weblinkList?.length <= 0) return; // save user mark for each weblink in a non-blocking style - this.weblinkService.saveWeblinkUserMarks({ - userId, - weblinkList, - }); + this.weblinkService.saveWeblinkUserMarks({ userId: user.id, weblinkList }); // 基于一组网页做总结,先获取网页内容 - const docs = await this.weblinkService.parseMultiWeblinks(weblinkList); + const docs = await this.weblinkService.readMultiWeblinks(weblinkList); let stream: IterableReadableStream; if (data?.actionType === QUICK_ACTION_TYPE.SUMMARY) { @@ -361,7 +362,7 @@ export class ConversationService { const getUserQuestion = (actionType: QUICK_ACTION_TYPE) => { switch (actionType) { case QUICK_ACTION_TYPE.SUMMARY: { - return '总结网页'; + return '总结网页'; // TODO: 国际化 } } }; diff --git a/reflyd/src/llm/llm.controller.ts b/reflyd/src/llm/llm.controller.ts index 3e535b0c5..4b5bdc163 100644 --- a/reflyd/src/llm/llm.controller.ts +++ b/reflyd/src/llm/llm.controller.ts @@ -29,7 +29,7 @@ export class LLMController { this.logger.log(`applyStrategy: ${body}`); const { url } = body; - const doc = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界 + const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界 const res = await this.llmService.applyStrategy(doc); return res; @@ -41,7 +41,7 @@ export class LLMController { this.logger.log(`applyStrategy: ${body}`); const { url } = body; - const doc = await 
this.weblinkService.parseWebLinkContent(url); // 处理错误边界 + const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界 const res = await this.llmService.applyStrategy(doc); return res; @@ -53,7 +53,7 @@ export class LLMController { this.logger.log(`extractContentMeta: ${body}`); const { url } = body; - const doc = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界 + const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界 const res = await this.llmService.extractContentMeta(doc); return res; @@ -70,7 +70,7 @@ export class LLMController { const { urls } = body; const contentList = await Promise.all( urls.map(async (item) => { - const doc = await this.weblinkService.parseWebLinkContent(item); // 处理错误边界 + const doc = await this.weblinkService.readWebLinkContent(item); // 处理错误边界 // TODO: 这里需要结合 meta + content 来进行多个网页的总结 const contentMeta = await this.llmService.extractContentMeta(doc); diff --git a/reflyd/src/llm/llm.module.ts b/reflyd/src/llm/llm.module.ts index 10d888b62..112e27b28 100644 --- a/reflyd/src/llm/llm.module.ts +++ b/reflyd/src/llm/llm.module.ts @@ -2,9 +2,10 @@ import { Module } from '@nestjs/common'; import { ConfigModule } from '@nestjs/config'; import { LlmService } from './llm.service'; import { CommonModule } from '../common/common.module'; +import { RAGModule } from '../rag/rag.module'; @Module({ - imports: [ConfigModule, CommonModule], + imports: [ConfigModule, CommonModule, RAGModule], providers: [LlmService], exports: [LlmService], }) diff --git a/reflyd/src/llm/llm.service.spec.ts b/reflyd/src/llm/llm.service.spec.ts index 4683ac1e5..0f14c257b 100644 --- a/reflyd/src/llm/llm.service.spec.ts +++ b/reflyd/src/llm/llm.service.spec.ts @@ -26,7 +26,7 @@ describe('LlmService', () => { it('extractContentMeta', async () => { const url = 'https://paulgraham.com/vcsqueeze.html'; - const doc = await weblinkService.parseWebLinkContent(url); + const doc = await weblinkService.readWebLinkContent(url); const res = await 
service.extractContentMeta(doc); expect(res).toEqual({}); }); diff --git a/reflyd/src/llm/llm.service.ts b/reflyd/src/llm/llm.service.ts index 98039b558..299d84bb7 100644 --- a/reflyd/src/llm/llm.service.ts +++ b/reflyd/src/llm/llm.service.ts @@ -34,7 +34,9 @@ import { categoryList } from '../prompts/utils/category'; import { Source } from '../types/weblink'; import { SearchResultContext } from '../types/search'; import { PrismaService } from '../common/prisma.service'; -import { LOCALE } from 'src/types/task'; +import { LOCALE } from '../types/task'; +import { RAGService } from '../rag/rag.service'; +import { SearchResult } from '../common/weaviate.dto'; @Injectable() export class LlmService implements OnModuleInit { @@ -51,6 +53,7 @@ export class LlmService implements OnModuleInit { constructor( private prisma: PrismaService, private configService: ConfigService, + private ragService: RAGService, ) {} async onModuleInit() { @@ -462,19 +465,24 @@ export class LlmService implements OnModuleInit { }); } - async getRetrievalDocs(query: string, filter?: any) { - this.logger.log( - `activated with query: ${query}, filter: ${JSON.stringify(filter)}`, - ); + async getRetrievalDocs(uid: string, query: string, url?: string) { + this.logger.log(`uid: ${uid}, activated with query: ${query}, url: ${url}`); - const retrievalResults = await this.retrieval(query, filter); + const retrievalResults: SearchResult[] = await this.ragService.retrieve({ + tenantId: uid, + query, + filter: { url }, + }); - this.logger.log('retrievalResults', retrievalResults); + this.logger.log('retrievalResults: ' + JSON.stringify(retrievalResults)); const retrievedDocs = retrievalResults.map((res) => ({ - metadata: res?.metadata, - pageContent: res?.pageContent as string, - score: res?.score, // similarity score + metadata: { + url: res.url, + title: res.title, + }, + pageContent: res.content, + score: parseFloat(res._additional.score) || 0, })); return retrievedDocs; diff --git 
a/reflyd/src/rag/rag.module.ts b/reflyd/src/rag/rag.module.ts new file mode 100644 index 000000000..0bc591923 --- /dev/null +++ b/reflyd/src/rag/rag.module.ts @@ -0,0 +1,11 @@ +import { Module } from '@nestjs/common'; +import { ConfigModule } from '@nestjs/config'; +import { CommonModule } from '../common/common.module'; +import { RAGService } from './rag.service'; + +@Module({ + imports: [ConfigModule, CommonModule], + providers: [RAGService], + exports: [RAGService], +}) +export class RAGModule {} diff --git a/reflyd/src/rag/rag.service.spec.ts b/reflyd/src/rag/rag.service.spec.ts new file mode 100644 index 000000000..e5be4b104 --- /dev/null +++ b/reflyd/src/rag/rag.service.spec.ts @@ -0,0 +1,18 @@ +import { Test, TestingModule } from '@nestjs/testing'; +import { RAGService } from './rag.service'; + +describe('RAGService', () => { + let service: RAGService; + + beforeEach(async () => { + const module: TestingModule = await Test.createTestingModule({ + providers: [RAGService], + }).compile(); + + service = module.get(RAGService); + }); + + it('should be defined', () => { + expect(service).toBeDefined(); + }); +}); diff --git a/reflyd/src/rag/rag.service.ts b/reflyd/src/rag/rag.service.ts new file mode 100644 index 000000000..0c4dc1dfe --- /dev/null +++ b/reflyd/src/rag/rag.service.ts @@ -0,0 +1,109 @@ +import { Injectable } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import avro from 'avsc'; +import { OpenAIEmbeddings } from '@langchain/openai'; +import { Document } from '@langchain/core/documents'; +import { TokenTextSplitter } from 'langchain/text_splitter'; +import { generateUuid5 } from 'weaviate-ts-client'; + +import { MinioService } from '../common/minio.service'; +import { PrismaService } from '../common/prisma.service'; +import { WeaviateService } from '../common/weaviate.service'; +import { + ContentDataObj, + ContentType, + HybridSearchParam, +} from '../common/weaviate.dto'; + +const READER_URL = 'https://r.jina.ai/'; + 
+export const ChunkAvroType = avro.Type.forSchema({ + type: 'record', + name: 'Chunk', + fields: [ + { name: 'id', type: 'string' }, + { name: 'url', type: 'string' }, + { name: 'type', type: 'string' }, + { name: 'title', type: 'string' }, + { name: 'content', type: 'string' }, + { name: 'vector', type: { type: 'array', items: 'float' } }, + ], +}); + +export const ContentAvroType = avro.Type.forSchema({ + type: 'record', + name: 'ContentChunks', + fields: [ + { + name: 'chunks', + type: { type: 'array', items: ChunkAvroType }, + }, + ], +}); + +export const PARSER_VERSION = '20240424'; + +@Injectable() +export class RAGService { + private embeddings: OpenAIEmbeddings; + private splitter: TokenTextSplitter; + + constructor( + private config: ConfigService, + private minio: MinioService, + private prisma: PrismaService, + private weaviate: WeaviateService, + ) { + this.embeddings = new OpenAIEmbeddings({ + modelName: 'text-embedding-3-large', + batchSize: 512, + dimensions: this.config.getOrThrow('vectorStore.vectorDim'), + timeout: 5000, + maxRetries: 3, + }); + this.splitter = new TokenTextSplitter({ + encodingName: 'cl100k_base', + chunkSize: 800, + chunkOverlap: 400, + }); + } + + async parseWebpage(url: string): Promise { + // TODO: error handling + const response = await fetch(READER_URL + url); + const text = await response.text(); + return { pageContent: text, metadata: {} }; + } + + async indexContent(param: { + url: string; + text?: string; + }): Promise { + const { url, text } = param; + + const chunks = await this.splitter.splitText(text); + const chunkEmbeds = await this.embeddings.embedDocuments(chunks); + + const dataObjs: ContentDataObj[] = []; + for (let i = 0; i < chunks.length; i++) { + dataObjs.push({ + id: generateUuid5(`${url}-${i}`), + url, + type: ContentType.weblink, + title: chunks[i], + content: chunks[i], + vector: chunkEmbeds[i], + }); + } + + return dataObjs; + } + + async saveDataForUser(uid: string, objList: ContentDataObj[]) { + 
await this.weaviate.batchSaveData(uid, objList); + } + + async retrieve(param: HybridSearchParam) { + return this.weaviate.hybridSearch(param); + } +} diff --git a/reflyd/src/utils/id.ts b/reflyd/src/utils/id.ts new file mode 100644 index 000000000..a3a679da9 --- /dev/null +++ b/reflyd/src/utils/id.ts @@ -0,0 +1,9 @@ +import { createId } from '@paralleldrive/cuid2'; + +export function genUID(): string { + return 'u-' + createId(); +} + +export function genLinkID(): string { + return 'l-' + createId(); +} diff --git a/reflyd/src/weblink/dto.ts b/reflyd/src/weblink/dto.ts index 3341384a4..72a444ed7 100644 --- a/reflyd/src/weblink/dto.ts +++ b/reflyd/src/weblink/dto.ts @@ -1,4 +1,5 @@ import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; +import { Document } from '@langchain/core/documents'; import { IndexStatus } from '@prisma/client'; export class WebLinkDTO { @@ -33,9 +34,11 @@ export class WebLinkDTO { pageContent?: string; // 反爬网站前端传入 @ApiPropertyOptional() - storageKey?: string; // 前端上传 html 拿到的 key + storageKey?: string; // 前端上传 html 拿到的 object key userId?: number; + + parsedDoc?: Document; // 服务端解析出的 Document } export class StoredWebLink extends WebLinkDTO { diff --git a/reflyd/src/weblink/weblink.controller.ts b/reflyd/src/weblink/weblink.controller.ts index 3ea266757..981c53503 100644 --- a/reflyd/src/weblink/weblink.controller.ts +++ b/reflyd/src/weblink/weblink.controller.ts @@ -38,7 +38,7 @@ export class WeblinkController { @Post('store') async store(@Request() req, @Body() body: StoreWebLinkParam) { this.logger.log(`user: ${req.user.id}, store link: ${body}`); - await this.weblinkService.storeLinks(req.user.id, body.data); + await this.weblinkService.storeLinks(req.user.uid, body.data); return { success: true }; } @@ -47,7 +47,7 @@ export class WeblinkController { async getWebContent(@Query('url') url) { this.logger.log(`getWebContent, ${url}`); - const parseContent = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界 + const 
parseContent = await this.weblinkService.readWebLinkContent(url); // 处理错误边界 return parseContent; } diff --git a/reflyd/src/weblink/weblink.module.ts b/reflyd/src/weblink/weblink.module.ts index 64557de42..d5382f3db 100644 --- a/reflyd/src/weblink/weblink.module.ts +++ b/reflyd/src/weblink/weblink.module.ts @@ -8,6 +8,7 @@ import { WeblinkService } from './weblink.service'; import { WeblinkProcessor } from './weblink.processor'; import { CommonModule } from '../common/common.module'; import { AigcModule } from '../aigc/aigc.module'; +import { RAGModule } from '../rag/rag.module'; import { QUEUE_STORE_LINK } from '../utils/const'; @Module({ @@ -15,6 +16,7 @@ import { QUEUE_STORE_LINK } from '../utils/const'; ConfigModule, CommonModule, AigcModule, + RAGModule, BullModule.registerQueue({ name: QUEUE_STORE_LINK }), ], controllers: [WeblinkController], diff --git a/reflyd/src/weblink/weblink.service.ts b/reflyd/src/weblink/weblink.service.ts index d09ee686f..c3bc00c76 100644 --- a/reflyd/src/weblink/weblink.service.ts +++ b/reflyd/src/weblink/weblink.service.ts @@ -1,26 +1,31 @@ import { Injectable } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; import { Prisma, Weblink } from '@prisma/client'; import { Queue } from 'bull'; import { LRUCache } from 'lru-cache'; import { InjectQueue } from '@nestjs/bull'; import * as cheerio from 'cheerio'; import { Document } from '@langchain/core/documents'; -import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'; import { LoggerService } from '../common/logger.service'; import { PrismaService } from '../common/prisma.service'; import { MinioService } from '../common/minio.service'; +import { + RAGService, + ContentAvroType, + PARSER_VERSION, +} from '../rag/rag.service'; import { AigcService } from '../aigc/aigc.service'; import { WebLinkDTO } from './dto'; import { getExpectedTokenLenContent } from '../utils/token'; -import { PageMeta, Source } from '../types/weblink'; +import { Source 
} from '../types/weblink'; import { QUEUE_STORE_LINK } from '../utils/const'; -import { ConfigService } from '@nestjs/config'; import { streamToString } from '../utils/stream'; +import { genLinkID } from '../utils/id'; @Injectable() export class WeblinkService { - private cache: LRUCache; // url -> document + private cache: LRUCache; // url -> parsed document (in markdown) private bucketName: string; constructor( @@ -28,6 +33,7 @@ export class WeblinkService { private prisma: PrismaService, private minio: MinioService, private configService: ConfigService, + private ragService: RAGService, private aigcService: AigcService, @InjectQueue(QUEUE_STORE_LINK) private indexQueue: Queue, ) { @@ -78,19 +84,6 @@ export class WeblinkService { } } - async createNewLink(link: WebLinkDTO, pageMeta: PageMeta) { - return this.prisma.weblink.create({ - data: { - url: link.url, - indexStatus: 'processing', - pageContent: '', // deprecated, always empty - storageKey: link.storageKey, - pageMeta: JSON.stringify(pageMeta), - contentMeta: '', - }, - }); - } - async getUserHistory(params: { skip?: number; take?: number; @@ -107,65 +100,39 @@ export class WeblinkService { * @param {string} url - The URL of the webpage to parse * @return {Promise} A Promise that resolves to the parsed document */ - async parseWebLinkContent(url: string): Promise { + async readWebLinkContent(url: string): Promise { // Check if the document is in the cache if (this.cache.has(url)) { this.logger.log(`in-mem cache hit: ${url}`); - return JSON.parse(this.cache.get(url)); + return this.cache.get(url); } // Check if the document is in the database const weblink = await this.prisma.weblink.findUnique({ - select: { storageKey: true, pageMeta: true }, + select: { parsedDocStorageKey: true, pageMeta: true }, where: { url }, }); if (weblink) { this.logger.log(`found weblink in db: ${url}`); const content = await this.minio.getObject( this.bucketName, - weblink.storageKey, + weblink.parsedDocStorageKey, ); const doc = 
new Document({ pageContent: await streamToString(content), metadata: JSON.parse(weblink.pageMeta), }); - this.cache.set(url, JSON.stringify(doc)); + this.cache.set(url, doc); return doc; } // Finally tries to fetch the content from the web - try { - const loader = new CheerioWebBaseLoader(url, { - maxRetries: 3, - timeout: 5000, - }); + const doc = await this.ragService.parseWebpage(url); - // customized webpage loading - // TODO: remove this in the future - const $ = await loader.scrape(); - // remove all styles and scripts tag - $('script, style, plasmo-csui, img, svg, meta, link').remove(); - // remove comments blocks - $('body') - .contents() - .each((i, node) => { - if (node.type === 'comment') { - $(node).remove(); - } - }); + this.cache.set(url, doc); - // only get meaning content - const pageContent = $.html(); - const title = $('title').text(); - const source = loader.webPath; - const doc = { pageContent, metadata: { title, source } }; - this.cache.set(url, JSON.stringify(doc)); - return doc; - } catch (err) { - this.logger.error(`process url ${url} failed: ${err}`); - return null; - } + return doc; } /** @@ -173,7 +140,7 @@ export class WeblinkService { * @param pageContent raw html page content * @returns nothing */ - async downloadWebLinkContent( + async directParseWebLinkContent( url: string, storageKey: string, ): Promise { @@ -198,7 +165,7 @@ export class WeblinkService { * @param weblinkList input weblinks * @returns langchain documents */ - async parseMultiWeblinks(weblinkList: Source[]): Promise { + async readMultiWeblinks(weblinkList: Source[]): Promise { // 处理 token 窗口,一共给 6K 窗口用于问答,平均分到每个网页,保障可用性 const avgTokenLen = 6000 / weblinkList?.length; @@ -212,7 +179,7 @@ export class WeblinkService { })); } - const { pageContent, metadata } = await this.parseWebLinkContent( + const { pageContent, metadata } = await this.readWebLinkContent( item.metadata?.source, ); return [ @@ -308,11 +275,64 @@ export class WeblinkService { totalReadTime: { increment: 
link.readTime || 0 }, }, }); + this.logger.log(`process link for user finish`); return uwb; } + async createNewWeblink(link: WebLinkDTO): Promise { + // Fetch doc and store in cache for later use + const doc = link.storageKey + ? await this.directParseWebLinkContent(link.url, link.storageKey) + : await this.readWebLinkContent(link.url); + this.cache.set(link.url, doc); + + // Upload parsed doc to minio + // TODO: sha256 of link url + const parsedDocStorageKey = `docs/${link.url}.md`; + const res = await this.minio.putObject( + this.configService.get('minio.weblinkBucket'), + parsedDocStorageKey, + doc.pageContent, + ); + this.logger.log('upload parsed doc to minio res: ' + JSON.stringify(res)); + + return this.prisma.weblink.create({ + data: { + url: link.url, + linkId: genLinkID(), + indexStatus: 'processing', + pageContent: '', // deprecated, always empty + storageKey: link.storageKey, + parsedDocStorageKey, + pageMeta: JSON.stringify({ title: link.title, source: link.url }), + contentMeta: '', + }, + }); + } + + async indexWeblink(weblink: Weblink, doc: Document) { + const dataObjs = await this.ragService.indexContent({ + url: weblink.url, + text: doc.pageContent, + }); + + const buf = ContentAvroType.toBuffer({ chunks: dataObjs }); + const chunkStorageKey = `content-${PARSER_VERSION}.avro`; + const res = await this.minio.putObject( + this.configService.get('minio.weblinkBucket'), + chunkStorageKey, + buf, + ); + this.logger.log('upload chunks to minio res: ' + JSON.stringify(res)); + + return this.prisma.weblink.update({ + where: { id: weblink.id }, + data: { chunkStorageKey }, + }); + } + async processLinkFromStoreQueue(link: WebLinkDTO) { this.logger.log(`process link from queue: ${JSON.stringify(link)}`); @@ -325,33 +345,17 @@ export class WeblinkService { // Link not found if (!weblink) { - if (!link.storageKey) { - return this.logger.warn( - `storageKey not provided for ${link.url}, skip`, - ); - } - - weblink = await this.createNewLink(link, { - title: 
link.title, - source: link.url, - }); + weblink = await this.createNewWeblink(link); } - // Fetch doc and store in cache for later use - const doc = link.storageKey - ? await this.downloadWebLinkContent(link.url, link.storageKey) - : await this.parseWebLinkContent(link.url); - - this.cache.set(link.url, JSON.stringify(doc)); + const doc = await this.readWebLinkContent(link.url); - // TODO: 优化 page content 的清洗逻辑 - const $ = cheerio.load(doc.pageContent); - doc.pageContent = $.text(); + await this.indexWeblink(weblink, doc); // 处理单个用户的访问记录 const uwb = await this.processLinkForUser(link, weblink); - await this.aigcService.runContentFlow({ doc, link, uwb, weblink }); + await this.aigcService.runContentFlow({ doc, uwb, weblink }); } } diff --git a/reflyd/yarn.lock b/reflyd/yarn.lock index c62f2caa6..2c2060169 100644 --- a/reflyd/yarn.lock +++ b/reflyd/yarn.lock @@ -400,6 +400,11 @@ resolved "https://registry.yarnpkg.com/@eslint/js/-/js-8.56.0.tgz#ef20350fec605a7f7035a01764731b2de0f3782b" integrity sha512-gMsVel9D7f2HLkBma9VbtzZRehRogVRfbr++f06nL2vnCGCNlzOD+/MUov/F4p8myyAHspEhVobgjpX64q5m6A== +"@graphql-typed-document-node/core@^3.1.1": + version "3.2.0" + resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.2.0.tgz#5f3d96ec6b2354ad6d8a28bf216a1d97b5426861" + integrity sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ== + "@grpc/grpc-js@^1.7.1": version "1.10.6" resolved "https://registry.yarnpkg.com/@grpc/grpc-js/-/grpc-js-1.10.6.tgz#1e3eb1af911dc888fbef7452f56a7573b8284d54" @@ -1009,6 +1014,11 @@ dependencies: tslib "2.5.3" +"@noble/hashes@^1.1.5": + version "1.4.0" + resolved "https://registry.yarnpkg.com/@noble/hashes/-/hashes-1.4.0.tgz#45814aa329f30e4fe0ba49426f49dfccdd066426" + integrity sha512-V1JJ1WTRUqHHrOSh597hURcMqVKVGL/ea3kv0gSnEdsEZ0/+VyPghM1lMNGc00z7CIQorSvbKpuJkxvuHbvdbg== + "@nodelib/fs.scandir@2.1.5": version "2.1.5" resolved 
"https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" @@ -1724,6 +1734,13 @@ dependencies: "@opentelemetry/core" "^1.1.0" +"@paralleldrive/cuid2@^2.2.2": + version "2.2.2" + resolved "https://registry.yarnpkg.com/@paralleldrive/cuid2/-/cuid2-2.2.2.tgz#7f91364d53b89e2c9cb9e02e8dd0f129e834455f" + integrity sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA== + dependencies: + "@noble/hashes" "^1.1.5" + "@prisma/client@5": version "5.12.1" resolved "https://registry.yarnpkg.com/@prisma/client/-/client-5.12.1.tgz#c26a674fea76754b3a9e8b90a11e617f90212f76" @@ -2916,6 +2933,11 @@ available-typed-arrays@^1.0.7: dependencies: possible-typed-array-names "^1.0.0" +avsc@^5.7.7: + version "5.7.7" + resolved "https://registry.yarnpkg.com/avsc/-/avsc-5.7.7.tgz#8d1b5fd85904cc96a1e439450633ff33f4aff57b" + integrity sha512-9cYNccliXZDByFsFliVwk5GvTq058Fj513CiR4E60ndDwmuXzTJEp/Bp8FyuRmGyYupLjHLs+JA9/CBoVS4/NQ== + babel-jest@^29.7.0: version "29.7.0" resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-29.7.0.tgz#f4369919225b684c56085998ac63dbd05be020d5" @@ -3553,6 +3575,13 @@ cron-parser@^4.2.1: dependencies: luxon "^3.2.1" +cross-fetch@^3.1.5: + version "3.1.8" + resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.8.tgz#0327eba65fd68a7d119f8fb2bf9334a1a7956f82" + integrity sha512-cvA+JwZoU0Xq+h6WkMvAUqPEYy92Obet6UdKLfW60qn99ftItKjB5T+BkyWOFWe2pUyfQ+IJHmpOTznqk1M6Kg== + dependencies: + node-fetch "^2.6.12" + cross-spawn@^7.0.0, cross-spawn@^7.0.2, cross-spawn@^7.0.3: version "7.0.3" resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" @@ -4113,6 +4142,11 @@ external-editor@^3.0.3: iconv-lite "^0.4.24" tmp "^0.0.33" +extract-files@^9.0.0: + version "9.0.0" + resolved "https://registry.yarnpkg.com/extract-files/-/extract-files-9.0.0.tgz#8a7744f2437f81f5ed3250ed9f1550de902fe54a" + integrity 
sha512-CvdFfHkC95B4bBBk36hcEmvdR2awOdhhVUYH6S/zrVj3477zven/fJMYg7121h4T1xHZC+tetUpubpAhxwI7hQ== + fast-copy@^3.0.0: version "3.0.2" resolved "https://registry.yarnpkg.com/fast-copy/-/fast-copy-3.0.2.tgz#59c68f59ccbcac82050ba992e0d5c389097c9d35" @@ -4284,6 +4318,15 @@ form-data-encoder@1.7.2: resolved "https://registry.yarnpkg.com/form-data-encoder/-/form-data-encoder-1.7.2.tgz#1f1ae3dccf58ed4690b86d87e4f57c654fbab040" integrity sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A== +form-data@^3.0.0: + version "3.0.1" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-3.0.1.tgz#ebd53791b78356a99af9a300d4282c4d5eb9755f" + integrity sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + form-data@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452" @@ -4526,6 +4569,21 @@ graphemer@^1.4.0: resolved "https://registry.yarnpkg.com/graphemer/-/graphemer-1.4.0.tgz#fb2f1d55e0e3a1849aeffc90c4fa0dd53a0e66c6" integrity sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag== +graphql-request@^5.2.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/graphql-request/-/graphql-request-5.2.0.tgz#a05fb54a517d91bb2d7aefa17ade4523dc5ebdca" + integrity sha512-pLhKIvnMyBERL0dtFI3medKqWOz/RhHdcgbZ+hMMIb32mEPa5MJSzS4AuXxfI4sRAu6JVVk5tvXuGfCWl9JYWQ== + dependencies: + "@graphql-typed-document-node/core" "^3.1.1" + cross-fetch "^3.1.5" + extract-files "^9.0.0" + form-data "^3.0.0" + +graphql@^16.8.1: + version "16.8.1" + resolved "https://registry.yarnpkg.com/graphql/-/graphql-16.8.1.tgz#1930a965bef1170603702acdb68aedd3f3cf6f07" + integrity sha512-59LZHPdGZVh695Ud9lRzPBVTtlX9ZCV150Er2W43ro37wVof0ctenSaskPPjN7lVTIN8mSZt8PHUNKZuNQUuxw== + grpc-tools@^1.12.4: version "1.12.4" resolved 
"https://registry.yarnpkg.com/grpc-tools/-/grpc-tools-1.12.4.tgz#a044c9e8157941033ea7a5f144c2dc9dc4501de4" @@ -6029,7 +6087,7 @@ node-emoji@1.11.0: dependencies: lodash "^4.17.21" -node-fetch@^2.0.0, node-fetch@^2.6.1, node-fetch@^2.6.7, node-fetch@^2.6.9: +node-fetch@^2.0.0, node-fetch@^2.6.1, node-fetch@^2.6.12, node-fetch@^2.6.7, node-fetch@^2.6.9: version "2.7.0" resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d" integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A== @@ -7720,6 +7778,14 @@ wcwidth@^1.0.1: dependencies: defaults "^1.0.3" +weaviate-ts-client@^2.1.1: + version "2.1.1" + resolved "https://registry.yarnpkg.com/weaviate-ts-client/-/weaviate-ts-client-2.1.1.tgz#5bf142f928b59be6cf74a5f388fbe03db11e6abc" + integrity sha512-d8yc2KnIEIV1beHAU8mhrElT3BoROoXGDsLlqFX8QGx3G+gOiPTRMc7SLy4F17+LvaUaTD0XkHvWX++4iehnsg== + dependencies: + graphql-request "^5.2.0" + uuid "^9.0.1" + web-encoding@^1.1.5: version "1.1.5" resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864"