diff --git a/.gitignore b/.gitignore
index f83b6c757..4242f3a93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,4 @@ __pycache__
 .yarn
 .env*
 **/node_modules/**
-.DS_Store
-data
-qdrant_storage/**
\ No newline at end of file
+.DS_Store
\ No newline at end of file
diff --git a/aero/.gitignore b/aero/.gitignore
index e1979a048..ad4a1f17f 100644
--- a/aero/.gitignore
+++ b/aero/.gitignore
@@ -174,8 +174,3 @@ poetry.toml
 pyrightconfig.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python
-唤起 Refly
-⌘B
-关闭
-Refly
-Refly
\ No newline at end of file
diff --git a/aero/data/mp.weixin.json b/aero/data/mp.weixin.json
new file mode 100644
index 000000000..c5338b1b5
--- /dev/null
+++ b/aero/data/mp.weixin.json
@@ -0,0 +1,30 @@
+[
+  {
+    "url": "https://mp.weixin.qq.com/s/s7H_0nzCw57-FY0yUnCXjQ",
+    "title": "RAG 修炼手册｜一文讲透 RAG 背后的技术"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/5mcafRElVm3-i9SqmqUoCw",
+    "title": "AI 硬件万字长文：一直游到海水变蓝"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/zL-Ro2JU3F_-yhu9RlSjlw",
+    "title": "Z Product | 华人之光！PyTorch之母的复旦女生开创模型平台，Benchmark、红杉等顶流投资"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/UO8hSKiQxS1j3YirRSraPQ",
+    "title": "无限追问？AI交互式搜索工具Flowith，内置MJ、GPT、SD等多种产品，功能齐全！"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/FbHTyHqEBJT-1PhA5x7FRg",
+    "title": "Linux之父讽刺AI炒作：很搞笑，大概我也会被大模型取代"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/e2n4ttcT8raDU877t53GPQ",
+    "title": "Llama 3超大杯有何惊喜？Meta会一直开源吗？当初为何笃信元宇宙？扎克伯格新访谈回应一切"
+  },
+  {
+    "url": "https://mp.weixin.qq.com/s/ixGAGRp9cdcPza45vORQ6w",
+    "title": "再见！波士顿动力人形机器人Atlas​"
+  }
+]
diff --git a/aero/data/unstructured-io.github.io.json b/aero/data/unstructured-io.github.io.json
new file mode 100644
index 000000000..aad8bf22e
--- /dev/null
+++ b/aero/data/unstructured-io.github.io.json
@@ -0,0 +1,474 @@
+[
+  {
+    "url": "https://unstructured-io.github.io/unstructured/introduction.html",
+    "title": "Introduction - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/best_practices/table_extraction_pdf.html",
+    "title": "Table Extraction from PDF - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/best_practices/strategies.html",
+    "title": "Strategies - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/best_practices.html",
+    "title": "Best Practices - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/examples/chroma.html",
+    "title": "Data Processing into Vector Database - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/examples/dict_to_elements.html",
+    "title": "Multi-files API Processing - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/examples/databricks.html",
+    "title": "Delta Table Source Connector - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/integrations.html",
+    "title": "Integrations - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/best_practices/models.html",
+    "title": "Models - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/examples.html",
+    "title": "Examples - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/metadata.html",
+    "title": "Metadata - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/fsspec_config.html",
+    "title": "Fsspec Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/embedding_config.html",
+    "title": "Embedding Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/chunking_config.html",
+    "title": "Chunking Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/retry_strategy_config.html",
+    "title": "Retry Strategy Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/permissions_config.html",
+    "title": "Permissions Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/processor_config.html",
+    "title": "Processor Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/partition_config.html",
+    "title": "Partition Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs/read_config.html",
+    "title": "Read Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/configs.html",
+    "title": "Ingest Configuration - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/weaviate.html",
+    "title": "Weaviate - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/vectara.html",
+    "title": "Vectara - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/s3.html",
+    "title": "S3 - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/sql.html",
+    "title": "SQL - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/opensearch.html",
+    "title": "OpenSearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/qdrant.html",
+    "title": "Qdrant - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/mongodb.html",
+    "title": "MongoDB - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/gcs.html",
+    "title": "Google Cloud Service - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/pinecone.html",
+    "title": "Pinecone - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/elasticsearch.html",
+    "title": "Elasticsearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/dropbox.html",
+    "title": "Dropbox - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/databricks_volumes.html",
+    "title": "Databricks Volumes - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/delta_table.html",
+    "title": "Delta Table - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/clarifai.html",
+    "title": "Clarifai - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/chroma.html",
+    "title": "Chroma - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/box.html",
+    "title": "Box - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/azure_cognitive_search.html",
+    "title": "Azure Cognitive Search - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/azure.html",
+    "title": "Azure - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/slack.html",
+    "title": "Slack - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors.html",
+    "title": "Destination Connectors - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/wikipedia.html",
+    "title": "Wikipedia - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/sharepoint.html",
+    "title": "Sharepoint - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/destination_connectors/astra.html",
+    "title": "Astra - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/sftp.html",
+    "title": "Sftp - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/salesforce.html",
+    "title": "Salesforce - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/s3.html",
+    "title": "S3 - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/reddit.html",
+    "title": "Reddit - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/outlook.html",
+    "title": "Outlook - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/opensearch.html",
+    "title": "OpenSearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/onedrive.html",
+    "title": "One Drive - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/mongodb.html",
+    "title": "MongoDB - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/notion.html",
+    "title": "Notion - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/google_cloud_storage.html",
+    "title": "Google Cloud Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/github.html",
+    "title": "Github - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/local_connector.html",
+    "title": "Local - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/dropbox.html",
+    "title": "Dropbox - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/google_drive.html",
+    "title": "Google Drive - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/jira.html",
+    "title": "Jira - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/elasticsearch.html",
+    "title": "Elasticsearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/gitlab.html",
+    "title": "Gitlab - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/discord.html",
+    "title": "Discord - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/delta_table.html",
+    "title": "Delta Table - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/confluence.html",
+    "title": "Confluence - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/biomed.html",
+    "title": "Biomed - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/box.html",
+    "title": "Box - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/azure.html",
+    "title": "Azure - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors/airtable.html",
+    "title": "Airtable - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/source_connectors.html",
+    "title": "Source Connectors - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/embedding.html",
+    "title": "Embedding - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/ingest/index.html",
+    "title": "Ingest - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/chunking.html",
+    "title": "Chunking - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/extracting.html",
+    "title": "Extracting - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/staging.html",
+    "title": "Staging - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/partition.html",
+    "title": "Partitioning - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/weaviate.html",
+    "title": "Weaviate - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core/cleaning.html",
+    "title": "Cleaning - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/core.html",
+    "title": "Core Functionality - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/postgresql.html",
+    "title": "PostgreSQL - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/pinecone.html",
+    "title": "Pinecone - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/opensearch.html",
+    "title": "OpenSearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/mongodb.html",
+    "title": "MongoDB - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/google_cloud_destination.html",
+    "title": "Google Cloud Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/elasticsearch_destination.html",
+    "title": "Elasticsearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/databricks.html",
+    "title": "Databricks - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/azure_cognitive_search.html",
+    "title": "Azure Cognitive Search - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/chroma.html",
+    "title": "Chroma - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/sftp.html",
+    "title": "SFTP Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/destination_platform.html",
+    "title": "Platform Destination Connectors - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_destinations/amazon_s3_destination.html",
+    "title": "Amazon S3 - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/salesforce.html",
+    "title": "Salesforce - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/sharepoint.html",
+    "title": "Sharepoint - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/opensearch.html",
+    "title": "OpenSearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/onedrive.html",
+    "title": "OneDrive Cloud Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/google_cloud_source.html",
+    "title": "Google Cloud Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/google_drive.html",
+    "title": "Google Drive - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/azure_blob.html",
+    "title": "Azure Blob Storage - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/amazon_s3_source.html",
+    "title": "Amazon S3 - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/platform_sources/elasticsearch_source.html",
+    "title": "Elasticsearch - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/source_platform.html",
+    "title": "Platform Source Connectors - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/job.html",
+    "title": "Jobs Scheduling - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platforms/workflow.html",
+    "title": "Workflows Automation - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/platform.html",
+    "title": "Unstructured Platform - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/validation_errors.html",
+    "title": "API Validation Errors - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/api_parameters.html",
+    "title": "API Parameters - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/usage_methods.html",
+    "title": "Accessing Unstructured API - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/api_sdks.html",
+    "title": "Python and JavaScript SDK - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/aws_marketplace.html",
+    "title": "AWS Marketplace Deployment Guide - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/azure_marketplace.html",
+    "title": "Azure Marketplace Deployment Guide - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/api.html",
+    "title": "Unstructured API Services - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/apis/saas_api.html",
+    "title": "SaaS API Deployment Guide - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/introduction.html",
+    "title": "Introduction - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/installing.html",
+    "title": "Installation - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/installation/docker.html",
+    "title": "Docker Installation - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/installation/full_installation.html",
+    "title": "Full Installation - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/introduction/key_concepts.html",
+    "title": "Key Concepts - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/introduction/overview.html",
+    "title": "Document Elements - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/introduction/getting_started.html",
+    "title": "Quick Start - Unstructured 0.13.0 documentation"
+  },
+  {
+    "url": "https://unstructured-io.github.io/unstructured/index.html",
+    "title": "Unstructured 0.13.0 documentation"
+  }
+]
diff --git a/aero/main.py b/aero/main.py
index aa70eeba5..9a308b1d6 100644
--- a/aero/main.py
+++ b/aero/main.py
@@ -10,7 +10,7 @@
 _cleanup_coroutines = []
 
 
-class Greeter(aero_pb2_grpc.AeroServicer):
+class AeroService(aero_pb2_grpc.AeroServicer):
     async def ParseHTML(
         self,
         request: aero_pb2.ParseHTMLRequest,
@@ -32,7 +32,7 @@ async def ParseHTML(
 
 async def serve() -> None:
     server = grpc.aio.server()
-    aero_pb2_grpc.add_AeroServicer_to_server(Greeter(), server)
+    aero_pb2_grpc.add_AeroServicer_to_server(AeroService(), server)
     listen_addr = "[::]:50051"
     server.add_insecure_port(listen_addr)
     logging.info("Starting server on %s", listen_addr)
diff --git a/aero/md.py b/aero/md.py
new file mode 100644
index 000000000..85e60469f
--- /dev/null
+++ b/aero/md.py
@@ -0,0 +1,24 @@
+from unstructured.partition.md import partition_md
+from langchain_text_splitters import MarkdownHeaderTextSplitter
+
+
+# Heading markers to split on, each paired with the label handed to the splitter.
+headers_to_split_on = [
+    ("#", "Header 1"),
+    ("##", "Header 2"),
+    ("###", "Header 3"),
+]
+# Read the markdown document that will be split.
+with open('./jina_data/intro.md') as fp:
+    markdown_document = fp.read()
+# Split on the headings above; strip_headers=False is passed through to the splitter.
+markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
+md_header_splits = markdown_splitter.split_text(markdown_document)
+# Print each split's content followed by a separator line.
+for split in md_header_splits:
+    print(split.page_content)
+    print("~" * 180)
+# NOTE(review): leftover snippet below; `chunks` is not defined anywhere in this file.
+# for chunk in chunks:
+#     print(chunk.text)
+#     print("-" * 80)
diff --git a/aero/prepare.py b/aero/prepare.py
new file mode 100644
index 000000000..3dadafb84
--- /dev/null
+++ b/aero/prepare.py
@@ -0,0 +1,21 @@
+from unstructured.partition.html import partition_html
+from unstructured.cleaners.core import clean
+from unstructured.chunking.title import chunk_by_title
+
+# url = "https://unstructured-io.github.io/unstructured/introduction.html"
+chunks = partition_html(url='https://refly.ai/', chunking_strategy='by_title', max_characters=1000, new_after_n_chars=800)
+# chunks = partition_html(
+#     filename="./html_files/https_unstructured-io.github.io_unstructured_introduction.html.html",
+# )
+# Keep only elements whose text has more than 10 whitespace-separated words.
+chunks = [chunk for chunk in chunks if len(chunk.text.split()) > 10]
+# NOTE(review): second by-title pass; partition_html above already received chunking_strategy='by_title' - confirm intended.
+chunks = chunk_by_title(chunks, max_characters=1200, new_after_n_chars=800)
+
+# chunks = partition_html(url=url, chunking_strategy='basic', max_characters=1000, new_after_n_chars=800, overlap=400)
+# Print each chunk's cleaned text followed by a separator line.
+# print([clean(chunk.text, extra_whitespace=True) for chunk in chunks])
+for chunk in chunks:
+    # print(chunk.to_dict())
+    print(clean(chunk.text, extra_whitespace=True))
+    print("-" * 50)
diff --git a/aero/requirements.txt b/aero/requirements.txt
deleted file mode 100644
index 7f830cf7b..000000000
--- a/aero/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-grpcio==1.62.1
-grpcio-tools==1.62.1
-lxml==5.2.1
-protobuf==4.25.3
-setuptools==69.5.1
diff --git a/aero/scripts/scrape.py b/aero/scripts/scrape.py
new file mode 100644
index 000000000..62517fe16
--- /dev/null
+++ b/aero/scripts/scrape.py
@@ -0,0 +1,27 @@
+import scrapy
+import json
+
+
+class ReflySpider(scrapy.Spider):
+    """Crawl the allowed domain, recording each page's URL and <title> exactly once."""
+    name = "refly_spider"
+    allowed_domains = ["unstructured-io.github.io"]
+    start_urls = ["https://unstructured-io.github.io/unstructured/introduction.html"]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.data = []
+        self.seen_urls = set()  # start requests bypass Scrapy's dupefilter, so a page can reach parse() twice
+
+    def parse(self, response):
+        """Record the page if it has not been recorded yet, then follow every link on it."""
+        if response.url not in self.seen_urls:
+            self.seen_urls.add(response.url)
+            self.data.append({"url": response.url, "title": response.css("title::text").get()})
+        for link in response.css("a::attr(href)"):
+            yield response.follow(link.get(), callback=self.parse)
+
+    def close(self, reason):
+        """Dump the collected records to "<first allowed domain>.json" in the working directory."""
+        with open(f"{self.allowed_domains[0]}.json", "w") as f:
+            json.dump(self.data, f, indent=2)
diff --git a/aero/split_by_token.py b/aero/split_by_token.py
new file mode 100644
index 000000000..199ea1a28
--- /dev/null
+++ b/aero/split_by_token.py
@@ -0,0 +1,13 @@
+from langchain_text_splitters import CharacterTextSplitter
+
+with open("./jina_data/mp.md", encoding="utf-8") as f:  # explicit: do not depend on the platform default
+    text = f.read()
+
+text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    encoding_name="cl100k_base", chunk_size=800, chunk_overlap=400
+)
+texts = text_splitter.split_text(text)
+
+for chunk in texts:  # distinct name so the full document in `text` is not rebound
+    print(chunk)
+    print("=" * 280)
\ No newline at end of file
diff --git a/aero/store_link.py b/aero/store_link.py
new file mode 100644
index 000000000..70af518a5
--- /dev/null
+++ b/aero/store_link.py
@@ -0,0 +1,143 @@
+import pickle
+import os
+import json
+import requests
+
+import weaviate
+import weaviate.classes as wvc
+from weaviate.classes.query import MetadataQuery, HybridFusion
+from weaviate.auth import AuthApiKey
+from weaviate.util import generate_uuid5
+from weaviate.classes.config import Configure, Property, DataType
+
+from langchain_text_splitters import CharacterTextSplitter
+
+COLLECTION_NAME = "Refly"
+READER_URL = "https://r.jina.ai/"  # store_link() prepends this to the target URL
+# Module-level client, created at import time; WEAVIATE_CLUSTER_URL / WEAVIATE_API_KEY must be set (KeyError otherwise).
+client = weaviate.connect_to_wcs(
+    cluster_url=os.environ['WEAVIATE_CLUSTER_URL'],
+    auth_credentials=AuthApiKey(os.environ['WEAVIATE_API_KEY']),
+    skip_init_checks=True,
+    headers={
+        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"),  # None when the variable is unset
+    }
+)
+
+
+def init_collection():
+    """Create the COLLECTION_NAME collection (properties + named vector) unless it already exists."""
+    if client.collections.exists(COLLECTION_NAME):
+        return
+    client.collections.create(
+        COLLECTION_NAME,
+        properties=[
+            Property(name="url", data_type=DataType.TEXT),
+            Property(name="type", data_type=DataType.TEXT),
+            Property(
+                name="title",
+                data_type=DataType.TEXT,
+                index_filterable=True,
+                index_searchable=True,
+            ),
+            Property(
+                name="content",
+                data_type=DataType.TEXT,
+                index_filterable=True,
+                index_searchable=True,
+            ),
+        ],
+        vectorizer_config=[
+            Configure.NamedVectors.text2vec_openai(
+                name="content",
+                model="text-embedding-3-large",
+                dimensions=256,
+                source_properties=["content"],
+            )
+        ],
+    )
+
+
+def store_link(url: str, title: str):
+    """Fetch `url` through the reader endpoint, split the text, and insert the chunks."""
+    resp = requests.get(READER_URL + url, timeout=60)  # never hang on a stalled fetch
+    if not resp.ok:
+        print(f'skipping {url}: reader returned HTTP {resp.status_code}')
+        return  # do not index an error page as content
+    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+        encoding_name="cl100k_base", chunk_size=800, chunk_overlap=400
+    )
+    texts = text_splitter.split_text(resp.text)
+    print(f'processing {url} with {len(texts)} chunks')
+    if not texts:
+        return  # nothing to insert
+
+    # Same constant init_collection() uses, so inserts follow a collection rename.
+    collection = client.collections.get(COLLECTION_NAME)
+    data_objects = []
+    for chunk in texts:
+        properties = {"url": url, "type": "weblink", "title": title, "content": chunk}
+        # UUID is derived from the properties via generate_uuid5.
+        data_objects.append(
+            wvc.data.DataObject(properties=properties, uuid=generate_uuid5(properties))
+        )
+    res = collection.data.insert_many(data_objects)
+    print(res)
+
+
+def load_html_from_local_pickle(fpath: str):
+    """Store the first two pickled items; each needs "url" and "title" keys."""
+    with open(fpath, "rb") as f:
+        data = pickle.load(f)  # NOTE: unpickling runs arbitrary code; trusted files only
+    for item in data[:2]:
+        print(item)
+        store_link(item["url"], item["title"])  # store_link accepts only (url, title)
+
+
+def load_html_from_local_json(fpath: str):
+    """Call store_link for every {"url", "title"} entry of the JSON array at `fpath`."""
+    with open(fpath, "r", encoding="utf-8") as f:  # explicit: do not depend on the platform default
+        data = json.load(f)
+    for item in data:
+        store_link(item["url"], item["title"])
+
+
+def read_data():
+    """Print each object's UUID and the length of its "content" vector."""
+    collection = client.collections.get(COLLECTION_NAME)
+    for item in collection.iterator(include_vector=True):
+        print(item.uuid, len(item.vector['content']))
+
+
+def search_data(query: str):
+    """Run a hybrid query for `query` (limit 3) and print each hit's properties and score."""
+    collection = client.collections.get(COLLECTION_NAME)
+    response = collection.query.hybrid(
+        query=query,
+        limit=3,
+        return_metadata=MetadataQuery(score=True, explain_score=True),
+    )
+    for o in response.objects:
+        print(o.properties)
+        print(o.metadata.score, o.metadata.explain_score)
+
+
+def delete_data_objects():
+    """Delete every object the collection iterator yields, one by one, printing each UUID."""
+    collection = client.collections.get(COLLECTION_NAME)
+    for item in collection.iterator():
+        print('delete', item.uuid)
+        collection.data.delete_by_id(item.uuid)
+
+
+if __name__ == "__main__":
+    try:
+        init_collection()  # returns early when the collection already exists
+        # store_link('https://unstructured-io.github.io/unstructured/core/chunking.html', 'chunking')
+        # delete_data_objects()
+        # load_html_from_local_json('./data/mp.weixin.json')
+        search_data('Linus')
+        # load_html_from_local_pickle('./unstructured-io.github.io.pickle')
+        # read_data()
+    finally:
+        client.close()  # always close the module-level client, even when a step raises
diff --git a/reflyd/package.json b/reflyd/package.json
index 0efb4018a..49f526a66 100644
--- a/reflyd/package.json
+++ b/reflyd/package.json
@@ -42,11 +42,14 @@
     "@opentelemetry/auto-instrumentations-node": "^0.44.0",
     "@opentelemetry/exporter-trace-otlp-http": "^0.50.0",
     "@opentelemetry/sdk-node": "^0.50.0",
+    "@paralleldrive/cuid2": "^2.2.2",
     "@prisma/client": "5",
     "@prisma/instrumentation": "^5.12.1",
+    "avsc": "^5.7.7",
     "bull": "^4.12.2",
     "cheerio": "^1.0.0-rc.12",
     "cookie-parser": "~1.4.6",
+    "graphql": "^16.8.1",
     "helmet": "^7.1.0",
     "langchain": "^0.1.21",
     "lodash.omit": "^4.5.0",
@@ -66,6 +69,7 @@
     "redis": "^4.6.13",
     "reflect-metadata": "^0.1.13",
     "rxjs": "^7.2.0",
+    "weaviate-ts-client": "^2.1.1",
     "zod": "^3.22.4"
   },
   "devDependencies": {
diff --git a/reflyd/prisma/schema.prisma b/reflyd/prisma/schema.prisma
index 40b5ac1d4..1af586069 100644
--- a/reflyd/prisma/schema.prisma
+++ b/reflyd/prisma/schema.prisma
@@ -41,6 +41,8 @@ model VerificationToken {
 model User {
   /// 主键
   id            Int       @id @default(autoincrement())
+  /// UID
+  uid           String?   @default("") @map("uid")
   /// 头像
   avatar        String?   @map("avatar")
   /// 用户名
@@ -67,6 +69,8 @@ model User {
 model Topic {
   /// 主键
   id          Int      @id @default(autoincrement())
+  /// 主题 id
+  topicId     String?  @default("") @map("topic_id")
   /// 主题 key
   key         String   @unique @map("key")
   /// 多语言名称 JSON 字符串 (key: 语言, val: 名称)
@@ -107,8 +111,10 @@ model UserPreference {
 
 // 会话模型
 model Conversation {
-  /// id为主键
+  /// 主键
   id              Int      @id @default(autoincrement())
+  /// 对话id
+  convId          String?  @default("") @map("conv_id")
   /// 用户id
   userId          Int      @map("user_id")
   /// 内容 id，指向 aigc_content 中的内容
@@ -140,6 +146,8 @@ model Conversation {
 model ChatMessage {
   /// id为主键
   id                    Int         @id @default(autoincrement())
+  /// 消息id
+  msgId                 String?     @default("") @map("msg_id")
   /// 会话id
   conversationId        Int         @map("conversation_id")
   /// 消息来源
@@ -203,29 +211,59 @@ model UserWeblink {
 
 model Weblink {
   /// id为主键
-  id          Int         @id @default(autoincrement())
+  id                  Int         @id @default(autoincrement())
+  /// Web page ID
+  linkId              String?     @default("") @map("link_id")
   /// 网页链接
-  url         String      @unique @map("url")
+  url                 String      @unique @map("url")
   /// 页面内容 (deprecated)
-  pageContent String      @map("page_content")
+  pageContent         String      @map("page_content")
   /// 对象存储 key
-  storageKey  String      @default("") @map("storage_key")
+  storageKey          String      @default("") @map("storage_key")
   /// 页面元数据, JSON 存储
-  pageMeta    String      @map("page_meta")
+  pageMeta            String      @map("page_meta")
   /// 内容元数据, JSON 存储
-  contentMeta String      @map("content_meta")
+  contentMeta         String      @map("content_meta")
   /// 索引状态
-  indexStatus IndexStatus @default(init) @map("index_status")
+  indexStatus         IndexStatus @default(init) @map("index_status")
+  /// Parser strategy version, always formatted as YYYYMMDD
+  parserVersion       String?     @default("00000000") @map("parser_version")
+  /// Storage key of the parsed content
+  parsedDocStorageKey String?     @default("") @map("parsed_doc_storage_key")
+  /// Storage key of the chunked data
+  chunkStorageKey     String?     @default("") @map("chunk_storage_key")
+  /// Time of the last parse (defaults to now() when the row is created)
+  lastParseTime       DateTime?   @default(now()) @map("last_parse_time") @db.Timestamptz()
   /// 创建时间
-  createdAt   DateTime    @default(now()) @map("created_at") @db.Timestamptz()
+  createdAt           DateTime    @default(now()) @map("created_at") @db.Timestamptz()
   /// 更新时间
-  updatedAt   DateTime    @updatedAt @map("updated_at") @db.Timestamptz()
+  updatedAt           DateTime    @updatedAt @map("updated_at") @db.Timestamptz()
 
   contents AigcContent[]
 
   @@map("weblinks")
 }
 
+model WeblinkLog {
+  /// Primary key
+  id            Int      @id @default(autoincrement())
+  /// ID of the associated weblink
+  weblinkId     Int      @map("weblink_id")
+  /// User ID
+  userId        Int      @map("user_id")
+  /// Parser strategy version
+  parserVersion String   @default("00000000") @map("parser_version")
+  /// Visit time
+  visitTime     DateTime @default(now()) @map("visit_time") @db.Timestamptz()
+  /// Creation time
+  createdAt     DateTime @default(now()) @map("created_at") @db.Timestamptz()
+  /// Last update time
+  updatedAt     DateTime @updatedAt @map("updated_at") @db.Timestamptz()
+
+  @@index([weblinkId, visitTime])
+  @@map("weblink_log")
+}
+
 model WeblinkUserMark {
   /// id为主键
   id               Int      @id @default(autoincrement())
@@ -255,6 +293,8 @@ model WeblinkUserMark {
 model AigcContent {
   /// 主键
   id         Int               @id @default(autoincrement())
+  /// 内容id
+  cid        String?           @default("") @map("cid")
   /// 标题
   title      String            @map("title")
   /// 摘要
@@ -295,6 +335,8 @@ model AigcContent {
 model UserDigest {
   /// 主键
   id        Int         @id @default(autoincrement())
+  /// 摘要 id
+  digestId  String?     @default("") @map("digest_id")
   /// 用户 id
   userId    Int         @map("user_id")
   /// 日期 (YYYY-MM-DD)
diff --git a/reflyd/scripts/hybrid-search.ts b/reflyd/scripts/hybrid-search.ts
new file mode 100644
index 000000000..515fcfde7
--- /dev/null
+++ b/reflyd/scripts/hybrid-search.ts
@@ -0,0 +1,85 @@
+import { DataType, MilvusClient } from '@zilliz/milvus2-sdk-node';
+
+// Connection settings for a local Milvus instance.
+const address = 'localhost:19530';
+const token = 'root:Milvus';
+const ssl = false;
+const milvusClient = new MilvusClient({ address, ssl, token });
+
+// Schema of the experimental `refly` collection.
+const params = {
+  collection_name: 'refly',
+  description: 'Refly Content Search',
+  fields: [
+    {
+      name: 'url',
+      description: 'weblink url',
+      data_type: DataType.VarChar,
+      // VarChar fields need a max_length; it was missing for this field.
+      max_length: 2048,
+    },
+    {
+      name: 'title',
+      description: 'weblink title',
+      data_type: DataType.VarChar,
+      max_length: 500,
+    },
+    {
+      name: 'type',
+      description: 'content type',
+      data_type: DataType.VarChar,
+      max_length: 100,
+    },
+    {
+      name: 'chunk_id',
+      description: 'chunk id',
+      data_type: DataType.VarChar,
+      max_length: 100,
+      // A collection needs a primary key; chunk_id is the only per-chunk
+      // identifier in this schema. NOTE(review): confirm this choice.
+      is_primary_key: true,
+    },
+    {
+      name: 'content',
+      description: 'chunked content',
+      data_type: DataType.VarChar,
+      max_length: 10000,
+    },
+    {
+      name: 'vector',
+      description: 'vector of chunked content',
+      data_type: DataType.FloatVector,
+      dim: 256,
+    },
+  ],
+  enableDynamicField: true,
+};
+
+/**
+ * Creates and loads the collection, then issues a hybrid search against it.
+ *
+ * NOTE(review): the search payload is still an empty placeholder, and no
+ * index is created on `vector` before loading — confirm both before relying
+ * on this script.
+ */
+async function main() {
+  const res = await milvusClient.createCollection(params);
+  console.log('create collections status:', res);
+
+  const loadRes = await milvusClient.loadCollection({
+    collection_name: params.collection_name,
+  });
+  console.log('load collections status:', loadRes);
+
+  await milvusClient.hybridSearch({
+    collection_name: 'refly',
+    anns_field: 'vector',
+    data: {},
+  });
+}
+
+// Report failures instead of leaving an unhandled promise rejection.
+main().catch((err) => {
+  console.error('hybrid search script failed:', err);
+  process.exitCode = 1;
+});
diff --git a/reflyd/src/aigc/aigc.service.ts b/reflyd/src/aigc/aigc.service.ts
index fab7ad9a5..020f59043 100644
--- a/reflyd/src/aigc/aigc.service.ts
+++ b/reflyd/src/aigc/aigc.service.ts
@@ -91,10 +91,7 @@ export class AigcService {
     return { ...content, inputs };
   }
 
-  private async updateUserPreferences(param: {
-    uwb: UserWeblink;
-    meta: ContentMeta;
-  }) {
+  async updateUserPreferences(param: { uwb: UserWeblink; meta: ContentMeta }) {
     const { uwb, meta } = param;
 
     // 对于每一个标注的 topic，更新用户喜好
@@ -126,7 +123,7 @@ export class AigcService {
    * @param param
    * @returns
    */
-  private async upsertUserDigest(param: {
+  async upsertUserDigest(param: {
     uwb: UserWeblink;
     content: AigcContent;
     meta: ContentMeta;
@@ -135,50 +132,6 @@ export class AigcService {
     const { userId } = uwb;
 
     const today = new Date().toISOString().split('T')[0];
-    // const digest = await this.prisma.userDigest.findUnique({
-    //   where: {
-    //     userId_date_topicKey: {
-    //       userId,
-    //       date: today,
-    //       topicKey: meta.topics[0].key,
-    //     },
-    //   },
-    // });
-
-    // 如果该 topic 下已有摘要，进行增量总结
-    // if (digest) {
-    //   const dContent = await this.prisma.aigcContent.findUnique({
-    //     where: { id: digest.contentId },
-    //     include: { inputs: true },
-    //   });
-
-    //   // 如果该 digest 输入的 content 已包含新的 content，则不做任何增量总结
-    //   if (dContent.inputIds.includes(content.id)) {
-    //     this.logger.log(
-    //       `digest ${digest.id} already contains content ${content.id}`,
-    //     );
-    //     return;
-    //   }
-
-    //   const combinedContent = await this.llmService.summarizeMultipleWeblink([
-    //     ...dContent.inputs,
-    //     content,
-    //   ]);
-
-    //   // 更新 aigc 依赖关系
-    //   this.prisma.$transaction(async (tx) => {
-    //     await tx.aigcContent.update({
-    //       where: { id: dContent.id },
-    //       data: { ...combinedContent, inputIds: { push: content.id } },
-    //     });
-    //     await tx.aigcContent.update({
-    //       where: { id: content.id },
-    //       data: { outputIds: { push: dContent.id } },
-    //     });
-    //   });
-
-    //   return;
-    // }
 
     // 创建新的 digest 内容及其对应的记录
     this.prisma.$transaction(async (tx) => {
@@ -263,54 +216,6 @@ export class AigcService {
     });
   }
 
-  // /**
-  //  * Dispatch feed to target users.
-  //  * @param content aigc content
-  //  * @returns
-  //  */
-  // private async dispatchFeed(param: {
-  //   weblink: Weblink;
-  //   meta: ContentMeta;
-  //   content: AIGCContent;
-  // }) {
-  //   const { weblink, meta, content } = param;
-
-  //   // topic 管理先简单点，就用 key 去匹配
-  //   // 介绍文案先前端写死
-  //   // await this.ensureTopics(meta);
-
-  //   // Find users related to this content
-  //   const userIds = await this.prisma.userPreference.findMany({
-  //     select: { userId: true },
-  //     where: {
-  //       topicKey: { in: meta.topics.map((t) => t.key) },
-  //       score: { gte: 0 }, // TODO: 设计更合适的推荐门槛
-  //     },
-  //   });
-
-  //   // Check if these users have read this source
-  //   const readLinkUsers = await this.prisma.userWeblink.findMany({
-  //     select: { userId: true },
-  //     where: {
-  //       url: weblink.url,
-  //       userId: { in: userIds.map((u) => u.userId) },
-  //     },
-  //   });
-  //   const readUserSet = new Set(readLinkUsers.map((elem) => elem.userId));
-  //   const unreadUsers = userIds.filter((u) => !readUserSet.has(u.userId));
-
-  //   // Add feed records for unread users
-  //   if (unreadUsers.length > 0) {
-  //     this.logger.log(`add feed ${content.id} to users: ${unreadUsers}`);
-  //     await this.prisma.userFeed.createMany({
-  //       data: unreadUsers.map((u) => ({
-  //         userId: u.userId,
-  //         contentId: content.id,
-  //       })),
-  //     });
-  //   }
-  // }
-
   /**
    * 处理全局内容流程: 应用内容策略，分发 feed
    * @param doc
@@ -318,11 +223,10 @@ export class AigcService {
    */
   async runContentFlow(param: {
     doc: Document;
-    link: WebLinkDTO;
     uwb: UserWeblink;
     weblink: Weblink;
   }) {
-    const { doc, weblink, uwb } = param;
+    const { doc, uwb, weblink } = param;
     let meta: ContentMeta;
 
     if (!weblink.contentMeta) {
@@ -335,14 +239,14 @@ export class AigcService {
         );
         return;
       }
-      if (shouldRunIndexPipeline(meta)) {
-        await this.llmService.indexPipelineFromLink(weblink.id, doc);
-      }
-      await this.prisma.weblink.update({
-        where: { id: weblink.id },
-        data: { contentMeta: JSON.stringify(meta), indexStatus: 'finish' },
-      });
-      await this.ensureTopics(meta);
+
+      await Promise.all([
+        this.prisma.weblink.update({
+          where: { id: weblink.id },
+          data: { contentMeta: JSON.stringify(meta) },
+        }),
+        this.ensureTopics(meta),
+      ]);
     } else {
       meta = JSON.parse(weblink.contentMeta);
     }
@@ -367,7 +271,6 @@ export class AigcService {
       meta,
       user,
     });
-    // await this.dispatchFeed({ ...param, meta, content });
 
     await this.runUserContentFlow({ ...param, meta, content });
   }
@@ -400,7 +303,3 @@ export class AigcService {
     }
   }
 }
-
-function shouldRunIndexPipeline(meta: ContentMeta): boolean {
-  return true;
-}
diff --git a/reflyd/src/app.module.ts b/reflyd/src/app.module.ts
index 36e7a4ba8..1b1778c88 100644
--- a/reflyd/src/app.module.ts
+++ b/reflyd/src/app.module.ts
@@ -9,6 +9,7 @@ import { UserModule } from './user/user.module';
 import { LlmModule } from './llm/llm.module';
 import { AccountModule } from './account/account.module';
 import { WeblinkModule } from './weblink/weblink.module';
+import { RAGModule } from './rag/rag.module';
 import { AigcModule } from './aigc/aigc.module';
 import { ConversationModule } from './conversation/conversation.module';
 
@@ -57,6 +58,7 @@ import { AppController } from './app.controller';
     WeblinkModule,
     LlmModule,
     AigcModule,
+    RAGModule,
   ],
   controllers: [AppController],
 })
diff --git a/reflyd/src/common/common.module.ts b/reflyd/src/common/common.module.ts
index 1ae325afa..a1d4355f2 100644
--- a/reflyd/src/common/common.module.ts
+++ b/reflyd/src/common/common.module.ts
@@ -3,10 +3,11 @@ import { ConfigModule } from '@nestjs/config';
 import { PrismaService } from './prisma.service';
 import { LoggerService } from './logger.service';
 import { MinioService } from './minio.service';
+import { WeaviateService } from './weaviate.service';
 
 @Module({
   imports: [ConfigModule],
-  providers: [PrismaService, MinioService, LoggerService],
-  exports: [PrismaService, MinioService, LoggerService],
+  providers: [PrismaService, MinioService, LoggerService, WeaviateService],
+  exports: [PrismaService, MinioService, LoggerService, WeaviateService],
 })
 export class CommonModule {}
diff --git a/reflyd/src/common/weaviate.dto.ts b/reflyd/src/common/weaviate.dto.ts
new file mode 100644
index 000000000..fe7d8968c
--- /dev/null
+++ b/reflyd/src/common/weaviate.dto.ts
@@ -0,0 +1,54 @@
+/**
+ * Kind of content stored in the vector store. Members carry explicit string
+ * values because the `type` property is declared as text in the Weaviate
+ * schema and as a string in the Avro chunk schema.
+ */
+export enum ContentType {
+  weblink = 'weblink',
+}
+
+/** One chunk of content together with its embedding, ready to be stored. */
+export interface ContentDataObj {
+  /** Object ID used when writing to the vector store. */
+  id: string;
+  url: string;
+  type: ContentType;
+  title: string;
+  content: string;
+  /** Embedding vector stored alongside the chunk. */
+  vector: number[];
+}
+
+/** Optional property filters applied to a search. */
+export interface MetadataFilter {
+  /** When set, only objects whose `url` equals this value are returned. */
+  url?: string;
+}
+
+/** Parameters of a hybrid search. */
+export interface HybridSearchParam {
+  /** Tenant to search in. */
+  tenantId: string;
+  /** Query text. */
+  query: string;
+  /** Optional query vector. */
+  vector?: number[];
+  filter?: MetadataFilter;
+  /** Maximum number of hits. */
+  limit?: number;
+}
+
+/** Ranking metadata requested under `_additional`. */
+export interface SearchMeta {
+  score: string;
+  explainScore: string;
+}
+
+/** One hit returned by a hybrid search. */
+export interface SearchResult {
+  url: string;
+  type: ContentType;
+  title: string;
+  content: string;
+  _additional: SearchMeta;
+}
diff --git a/reflyd/src/common/weaviate.service.ts b/reflyd/src/common/weaviate.service.ts
new file mode 100644
index 000000000..5018ea65a
--- /dev/null
+++ b/reflyd/src/common/weaviate.service.ts
@@ -0,0 +1,156 @@
+import { Injectable, Logger, OnModuleInit } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import weaviate, {
+  WeaviateClient,
+  ApiKey,
+  FusionType,
+} from 'weaviate-ts-client';
+
+import {
+  ContentDataObj,
+  HybridSearchParam,
+  SearchResult,
+} from './weaviate.dto';
+
+// Weaviate class that stores content chunks. Multi-tenancy is enabled, so the
+// reads and writes below always name a tenant.
+const reflyContentSchema = {
+  class: 'Content',
+  properties: [
+    {
+      name: 'url',
+      dataType: ['text'],
+    },
+    {
+      name: 'type',
+      dataType: ['text'],
+    },
+    {
+      name: 'title',
+      dataType: ['text'],
+    },
+    {
+      name: 'content',
+      dataType: ['text'],
+    },
+  ],
+  multiTenancyConfig: { enabled: true },
+};
+
+/** Wrapper around the Weaviate client for the `Content` class. */
+@Injectable()
+export class WeaviateService implements OnModuleInit {
+  private readonly logger = new Logger(WeaviateService.name);
+  private client: WeaviateClient;
+
+  /**
+   * Builds the client from the `vectorStore.host` and `vectorStore.apiKey`
+   * settings; `getOrThrow` makes both of them mandatory.
+   */
+  constructor(private configService: ConfigService) {
+    this.client = weaviate.client({
+      scheme: 'https',
+      host: this.configService.getOrThrow('vectorStore.host'),
+      apiKey: new ApiKey(this.configService.getOrThrow('vectorStore.apiKey')),
+    });
+  }
+
+  /** Ensures the content class exists when the module is initialised. */
+  async onModuleInit() {
+    await this.ensureCollectionExists();
+  }
+
+  /**
+   * Looks up the content class and creates it from `reflyContentSchema` when
+   * the lookup yields nothing, then logs the resulting definition.
+   *
+   * NOTE(review): this assumes the lookup resolves with a falsy value for a
+   * missing class; confirm the client does not reject in that case.
+   */
+  async ensureCollectionExists() {
+    let classDefinition = await this.client.schema
+      .classGetter()
+      .withClassName(reflyContentSchema.class)
+      .do();
+
+    if (!classDefinition) {
+      this.logger.log('class definition not found, create new one');
+      classDefinition = await this.client.schema
+        .classCreator()
+        .withClass(reflyContentSchema)
+        .do();
+    }
+
+    this.logger.log(
+      'collection definition: ' + JSON.stringify(classDefinition, null, 2),
+    );
+  }
+
+  /**
+   * Stores the given objects for one tenant with a single batch request.
+   * @param tenantId tenant the objects are written to
+   * @param data objects to store; an empty list is a no-op
+   */
+  async batchSaveData(tenantId: string, data: ContentDataObj[]) {
+    // Nothing to send; the batcher is expected to reject an empty batch.
+    if (!data?.length) {
+      return;
+    }
+
+    let batcher = this.client.batch.objectsBatcher();
+    for (const obj of data) {
+      batcher = batcher.withObject({
+        class: reflyContentSchema.class,
+        properties: {
+          url: obj.url,
+          type: obj.type,
+          title: obj.title,
+          content: obj.content,
+        },
+        id: obj.id,
+        vector: obj.vector,
+        tenant: tenantId,
+      });
+    }
+
+    // Flush
+    await batcher.do();
+  }
+
+  /**
+   * Runs a hybrid search over the content class for one tenant.
+   *
+   * The query text and optional vector are sent with `alpha: 0.5` and the
+   * ranked fusion type. At most `param.limit` hits are requested (5 when
+   * unset), optionally restricted to objects whose `url` equals the filter.
+   * @returns the hits, or an empty array when the response has none
+   */
+  async hybridSearch(param: HybridSearchParam): Promise<SearchResult[]> {
+    let getter = this.client.graphql
+      .get()
+      .withTenant(param.tenantId)
+      .withClassName(reflyContentSchema.class)
+      .withHybrid({
+        query: param.query,
+        alpha: 0.5,
+        vector: param.vector,
+        fusionType: FusionType.rankedFusion,
+      })
+      .withLimit(param.limit || 5)
+      .withFields('url type title content _additional { score explainScore }');
+
+    if (param.filter?.url) {
+      getter = getter.withWhere({
+        path: ['url'],
+        operator: 'Equal',
+        valueText: param.filter.url,
+      });
+    }
+
+    const res = await getter.do();
+    this.logger.log('hybrid search result: ' + JSON.stringify(res, null, 2));
+
+    // Callers map over the result, so never hand back undefined.
+    return res.data?.Get?.[reflyContentSchema.class] ?? [];
+  }
+}
diff --git a/reflyd/src/config/app.config.ts b/reflyd/src/config/app.config.ts
index 923c14a59..80c4584d1 100644
--- a/reflyd/src/config/app.config.ts
+++ b/reflyd/src/config/app.config.ts
@@ -13,7 +13,9 @@ export default () => ({
     weblinkBucket: process.env.MINIO_WEBLINK_BUCKET || 'refly-weblink',
   },
   vectorStore: {
-    vectorDim: parseInt(process.env.REFLY_VEC_DIM) || 1024,
+    host: process.env.WEAVIATE_INSTANCE_URL,
+    apiKey: process.env.WEAVIATE_API_KEY,
+    vectorDim: parseInt(process.env.REFLY_VEC_DIM) || 256,
   },
   serper: {
     apiKey: process.env.SERPER_API_KEY,
diff --git a/reflyd/src/conversation/conversation.controller.ts b/reflyd/src/conversation/conversation.controller.ts
index 534878d78..698389c97 100644
--- a/reflyd/src/conversation/conversation.controller.ts
+++ b/reflyd/src/conversation/conversation.controller.ts
@@ -105,7 +105,7 @@ export class ConversationController {
     res.setHeader('Connection', 'keep-alive');
     res.status(200);
 
-    await this.conversationService.chat(res, convId, req.user.id, body.task);
+    await this.conversationService.chat(res, req.user, convId, body.task);
   }
 
   @UseGuards(JwtAuthGuard)
diff --git a/reflyd/src/conversation/conversation.service.ts b/reflyd/src/conversation/conversation.service.ts
index 5027f7adc..46bc01d12 100644
--- a/reflyd/src/conversation/conversation.service.ts
+++ b/reflyd/src/conversation/conversation.service.ts
@@ -3,7 +3,7 @@ import { Response } from 'express';
 
 import { PrismaService } from '../common/prisma.service';
 import { CreateChatMessageInput, CreateConversationParam } from './dto';
-import { Prisma, ChatMessage } from '@prisma/client';
+import { Prisma, ChatMessage, User } from '@prisma/client';
 import {
   LOCALE,
   QUICK_ACTION_TASK_PAYLOAD,
@@ -93,7 +93,7 @@ export class ConversationService {
     });
   }
 
-  async chat(res: Response, convId: number, userId: number, task: Task) {
+  async chat(res: Response, user: User, convId: number, task: Task) {
     const { taskType, data = {} } = task;
 
     const query = data?.question || '';
@@ -104,18 +104,18 @@ export class ConversationService {
 
     let taskRes: TaskResponse;
     if (taskType === TASK_TYPE.QUICK_ACTION) {
-      taskRes = await this.handleQuickActionTask(res, userId, task);
+      taskRes = await this.handleQuickActionTask(res, user, task);
     } else if (taskType === TASK_TYPE.SEARCH_ENHANCE_ASK) {
       taskRes = await this.handleSearchEnhanceTask(res, task, chatHistory);
     } else {
-      taskRes = await this.handleChatTask(res, userId, task, chatHistory);
+      taskRes = await this.handleChatTask(res, user, task, chatHistory);
     }
     res.end(``);
 
     const newMessages: CreateChatMessageInput[] = [
       {
         type: 'human',
-        userId,
+        userId: user.id,
         conversationId: convId,
         content: query,
         sources: '',
@@ -127,7 +127,7 @@ export class ConversationService {
       },
       {
         type: 'ai',
-        userId,
+        userId: user.id,
         conversationId: convId,
         content: taskRes.answer,
         sources: JSON.stringify(taskRes.sources),
@@ -147,17 +147,17 @@ export class ConversationService {
 
   async handleChatTask(
     res: Response,
-    userId: number,
+    user: User,
     task: Task,
     chatHistory: ChatMessage[],
   ): Promise<TaskResponse> {
-    const locale = task?.locale || LOCALE.EN;
+    const locale = task?.locale || (user.outputLocale as LOCALE) || LOCALE.EN;
 
     const filter: any = {
       must: [
         {
           key: 'userId',
-          match: { value: userId },
+          match: { value: user.uid },
         },
       ],
     };
@@ -193,10 +193,14 @@ export class ConversationService {
           );
 
     const sources = chatFromClientSelector
-      ? await this.weblinkService.parseMultiWeblinks(
+      ? await this.weblinkService.readMultiWeblinks(
           task?.data?.filter?.weblinkList,
         )
-      : await this.llmService.getRetrievalDocs(questionWithContext, filter);
+      : await this.llmService.getRetrievalDocs(
+          user.uid,
+          questionWithContext,
+          filter,
+        );
 
     const { stream } = await this.llmService.chat(
       questionWithContext,
@@ -300,7 +304,7 @@ export class ConversationService {
 
   async handleQuickActionTask(
     res: Response,
-    userId: number,
+    user: User,
     task: Task,
   ): Promise<TaskResponse> {
     const data = task?.data as QUICK_ACTION_TASK_PAYLOAD;
@@ -328,13 +332,10 @@ export class ConversationService {
     if (weblinkList?.length <= 0) return;
 
     // save user mark for each weblink in a non-blocking style
-    this.weblinkService.saveWeblinkUserMarks({
-      userId,
-      weblinkList,
-    });
+    this.weblinkService.saveWeblinkUserMarks({ userId: user.id, weblinkList });
 
     // 基于一组网页做总结，先获取网页内容
-    const docs = await this.weblinkService.parseMultiWeblinks(weblinkList);
+    const docs = await this.weblinkService.readMultiWeblinks(weblinkList);
 
     let stream: IterableReadableStream<BaseMessageChunk>;
     if (data?.actionType === QUICK_ACTION_TYPE.SUMMARY) {
@@ -361,7 +362,7 @@ export class ConversationService {
     const getUserQuestion = (actionType: QUICK_ACTION_TYPE) => {
       switch (actionType) {
         case QUICK_ACTION_TYPE.SUMMARY: {
-          return '总结网页';
+          return '总结网页'; // TODO: 国际化
         }
       }
     };
diff --git a/reflyd/src/llm/llm.controller.ts b/reflyd/src/llm/llm.controller.ts
index 3e535b0c5..4b5bdc163 100644
--- a/reflyd/src/llm/llm.controller.ts
+++ b/reflyd/src/llm/llm.controller.ts
@@ -29,7 +29,7 @@ export class LLMController {
     this.logger.log(`applyStrategy: ${body}`);
 
     const { url } = body;
-    const doc = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界
+    const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界
     const res = await this.llmService.applyStrategy(doc);
 
     return res;
@@ -41,7 +41,7 @@ export class LLMController {
     this.logger.log(`applyStrategy: ${body}`);
 
     const { url } = body;
-    const doc = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界
+    const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界
     const res = await this.llmService.applyStrategy(doc);
 
     return res;
@@ -53,7 +53,7 @@ export class LLMController {
     this.logger.log(`extractContentMeta: ${body}`);
 
     const { url } = body;
-    const doc = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界
+    const doc = await this.weblinkService.readWebLinkContent(url); // 处理错误边界
     const res = await this.llmService.extractContentMeta(doc);
 
     return res;
@@ -70,7 +70,7 @@ export class LLMController {
     const { urls } = body;
     const contentList = await Promise.all(
       urls.map(async (item) => {
-        const doc = await this.weblinkService.parseWebLinkContent(item); // 处理错误边界
+        const doc = await this.weblinkService.readWebLinkContent(item); // 处理错误边界
 
         // TODO: 这里需要结合 meta + content 来进行多个网页的总结
         const contentMeta = await this.llmService.extractContentMeta(doc);
diff --git a/reflyd/src/llm/llm.module.ts b/reflyd/src/llm/llm.module.ts
index 10d888b62..112e27b28 100644
--- a/reflyd/src/llm/llm.module.ts
+++ b/reflyd/src/llm/llm.module.ts
@@ -2,9 +2,10 @@ import { Module } from '@nestjs/common';
 import { ConfigModule } from '@nestjs/config';
 import { LlmService } from './llm.service';
 import { CommonModule } from '../common/common.module';
+import { RAGModule } from '../rag/rag.module';
 
 @Module({
-  imports: [ConfigModule, CommonModule],
+  imports: [ConfigModule, CommonModule, RAGModule],
   providers: [LlmService],
   exports: [LlmService],
 })
diff --git a/reflyd/src/llm/llm.service.spec.ts b/reflyd/src/llm/llm.service.spec.ts
index 4683ac1e5..0f14c257b 100644
--- a/reflyd/src/llm/llm.service.spec.ts
+++ b/reflyd/src/llm/llm.service.spec.ts
@@ -26,7 +26,7 @@ describe('LlmService', () => {
 
   it('extractContentMeta', async () => {
     const url = 'https://paulgraham.com/vcsqueeze.html';
-    const doc = await weblinkService.parseWebLinkContent(url);
+    const doc = await weblinkService.readWebLinkContent(url);
     const res = await service.extractContentMeta(doc);
     expect(res).toEqual({});
   });
diff --git a/reflyd/src/llm/llm.service.ts b/reflyd/src/llm/llm.service.ts
index 98039b558..299d84bb7 100644
--- a/reflyd/src/llm/llm.service.ts
+++ b/reflyd/src/llm/llm.service.ts
@@ -34,7 +34,9 @@ import { categoryList } from '../prompts/utils/category';
 import { Source } from '../types/weblink';
 import { SearchResultContext } from '../types/search';
 import { PrismaService } from '../common/prisma.service';
-import { LOCALE } from 'src/types/task';
+import { LOCALE } from '../types/task';
+import { RAGService } from '../rag/rag.service';
+import { SearchResult } from '../common/weaviate.dto';
 
 @Injectable()
 export class LlmService implements OnModuleInit {
@@ -51,6 +53,7 @@ export class LlmService implements OnModuleInit {
   constructor(
     private prisma: PrismaService,
     private configService: ConfigService,
+    private ragService: RAGService,
   ) {}
 
   async onModuleInit() {
@@ -462,19 +465,24 @@ export class LlmService implements OnModuleInit {
     });
   }
 
-  async getRetrievalDocs(query: string, filter?: any) {
-    this.logger.log(
-      `activated with query: ${query}, filter: ${JSON.stringify(filter)}`,
-    );
+  async getRetrievalDocs(uid: string, query: string, url?: string) {
+    this.logger.log(`uid: ${uid}, activated with query: ${query}, url: ${url}`);
 
-    const retrievalResults = await this.retrieval(query, filter);
+    const retrievalResults: SearchResult[] = await this.ragService.retrieve({
+      tenantId: uid,
+      query,
+      filter: { url },
+    });
 
-    this.logger.log('retrievalResults', retrievalResults);
+    this.logger.log('retrievalResults: ' + JSON.stringify(retrievalResults));
 
     const retrievedDocs = retrievalResults.map((res) => ({
-      metadata: res?.metadata,
-      pageContent: res?.pageContent as string,
-      score: res?.score, // similarity score
+      metadata: {
+        url: res.url,
+        title: res.title,
+      },
+      pageContent: res.content,
+      score: parseFloat(res._additional.score) || 0,
     }));
 
     return retrievedDocs;
diff --git a/reflyd/src/rag/rag.module.ts b/reflyd/src/rag/rag.module.ts
new file mode 100644
index 000000000..0bc591923
--- /dev/null
+++ b/reflyd/src/rag/rag.module.ts
@@ -0,0 +1,11 @@
+import { Module } from '@nestjs/common';
+import { ConfigModule } from '@nestjs/config';
+import { CommonModule } from '../common/common.module';
+import { RAGService } from './rag.service';
+
+@Module({
+  imports: [ConfigModule, CommonModule],
+  providers: [RAGService],
+  exports: [RAGService], // exported so importing modules can inject RAGService
+})
+export class RAGModule {}
diff --git a/reflyd/src/rag/rag.service.spec.ts b/reflyd/src/rag/rag.service.spec.ts
new file mode 100644
index 000000000..e5be4b104
--- /dev/null
+++ b/reflyd/src/rag/rag.service.spec.ts
@@ -0,0 +1,35 @@
+import { Test, TestingModule } from '@nestjs/testing';
+import { ConfigService } from '@nestjs/config';
+import { MinioService } from '../common/minio.service';
+import { PrismaService } from '../common/prisma.service';
+import { WeaviateService } from '../common/weaviate.service';
+import { RAGService } from './rag.service';
+
+describe('RAGService', () => {
+  let service: RAGService;
+
+  beforeEach(async () => {
+    // The RAGService constructor builds an OpenAIEmbeddings instance, which is
+    // expected to reject a missing API key (TODO confirm). A placeholder is
+    // enough here because the test never sends a request.
+    process.env.OPENAI_API_KEY = process.env.OPENAI_API_KEY || 'test-key';
+
+    const module: TestingModule = await Test.createTestingModule({
+      providers: [
+        RAGService,
+        // RAGService injects these four providers; stub them so the testing
+        // module can resolve it without real infrastructure.
+        { provide: ConfigService, useValue: { getOrThrow: () => 256 } },
+        { provide: MinioService, useValue: {} },
+        { provide: PrismaService, useValue: {} },
+        { provide: WeaviateService, useValue: {} },
+      ],
+    }).compile();
+
+    service = module.get<RAGService>(RAGService);
+  });
+
+  it('should be defined', () => {
+    expect(service).toBeDefined();
+  });
+});
diff --git a/reflyd/src/rag/rag.service.ts b/reflyd/src/rag/rag.service.ts
new file mode 100644
index 000000000..0c4dc1dfe
--- /dev/null
+++ b/reflyd/src/rag/rag.service.ts
@@ -0,0 +1,142 @@
+import { Injectable } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
+import avro from 'avsc';
+import { OpenAIEmbeddings } from '@langchain/openai';
+import { Document } from '@langchain/core/documents';
+import { TokenTextSplitter } from 'langchain/text_splitter';
+import { generateUuid5 } from 'weaviate-ts-client';
+
+import { MinioService } from '../common/minio.service';
+import { PrismaService } from '../common/prisma.service';
+import { WeaviateService } from '../common/weaviate.service';
+import {
+  ContentDataObj,
+  ContentType,
+  HybridSearchParam,
+} from '../common/weaviate.dto';
+
+// Reader endpoint; the target URL is appended to it verbatim.
+const READER_URL = 'https://r.jina.ai/';
+
+// Avro schema of one stored chunk; mirrors the fields of ContentDataObj.
+export const ChunkAvroType = avro.Type.forSchema({
+  type: 'record',
+  name: 'Chunk',
+  fields: [
+    { name: 'id', type: 'string' },
+    { name: 'url', type: 'string' },
+    { name: 'type', type: 'string' },
+    { name: 'title', type: 'string' },
+    { name: 'content', type: 'string' },
+    { name: 'vector', type: { type: 'array', items: 'float' } },
+  ],
+});
+
+// Avro schema wrapping the list of chunks of one piece of content.
+export const ContentAvroType = avro.Type.forSchema({
+  type: 'record',
+  name: 'ContentChunks',
+  fields: [
+    {
+      name: 'chunks',
+      type: { type: 'array', items: ChunkAvroType },
+    },
+  ],
+});
+
+// Version tag of the parsing strategy, formatted as YYYYMMDD.
+export const PARSER_VERSION = '20240424';
+
+/** Parses, chunks, embeds, stores and retrieves content for RAG. */
+@Injectable()
+export class RAGService {
+  private embeddings: OpenAIEmbeddings;
+  private splitter: TokenTextSplitter;
+
+  constructor(
+    private config: ConfigService,
+    private minio: MinioService,
+    private prisma: PrismaService,
+    private weaviate: WeaviateService,
+  ) {
+    // The embedding size comes from `vectorStore.vectorDim` (mandatory).
+    this.embeddings = new OpenAIEmbeddings({
+      modelName: 'text-embedding-3-large',
+      batchSize: 512,
+      dimensions: this.config.getOrThrow('vectorStore.vectorDim'),
+      timeout: 5000,
+      maxRetries: 3,
+    });
+    // Token-based chunks of 800 tokens, each overlapping the next by 400.
+    this.splitter = new TokenTextSplitter({
+      encodingName: 'cl100k_base',
+      chunkSize: 800,
+      chunkOverlap: 400,
+    });
+  }
+
+  /**
+   * Fetches the page through the reader endpoint and wraps the response text
+   * in a Document with empty metadata.
+   * @param url address of the page to read
+   * @throws Error when the reader answers with a non-2xx status, so an error
+   * page is never returned as page content
+   */
+  async parseWebpage(url: string): Promise<Document> {
+    const response = await fetch(READER_URL + url);
+    if (!response.ok) {
+      throw new Error(
+        `failed to read ${url}: ${response.status} ${response.statusText}`,
+      );
+    }
+    const text = await response.text();
+    return { pageContent: text, metadata: {} };
+  }
+
+  /**
+   * Splits the text into chunks, embeds every chunk and builds one data
+   * object per chunk. Object IDs are derived from the URL and chunk index.
+   * @param param.url URL the text belongs to
+   * @param param.text text to index; when missing or empty nothing is indexed
+   * @returns the data objects, in chunk order
+   */
+  async indexContent(param: {
+    url: string;
+    text?: string;
+  }): Promise<ContentDataObj[]> {
+    const { url, text } = param;
+
+    // `text` is optional in the signature; the splitter cannot take undefined.
+    if (!text) {
+      return [];
+    }
+
+    const chunks = await this.splitter.splitText(text);
+    const chunkEmbeds = await this.embeddings.embedDocuments(chunks);
+
+    const dataObjs: ContentDataObj[] = [];
+    for (let i = 0; i < chunks.length; i++) {
+      dataObjs.push({
+        id: generateUuid5(`${url}-${i}`),
+        url,
+        type: ContentType.weblink,
+        // NOTE(review): the title currently repeats the chunk text.
+        title: chunks[i],
+        content: chunks[i],
+        vector: chunkEmbeds[i],
+      });
+    }
+
+    return dataObjs;
+  }
+
+  /** Stores the data objects in the tenant identified by `uid`. */
+  async saveDataForUser(uid: string, objList: ContentDataObj[]) {
+    await this.weaviate.batchSaveData(uid, objList);
+  }
+
+  /** Runs a hybrid search with the given parameters. */
+  async retrieve(param: HybridSearchParam) {
+    return this.weaviate.hybridSearch(param);
+  }
+}
diff --git a/reflyd/src/utils/id.ts b/reflyd/src/utils/id.ts
new file mode 100644
index 000000000..a3a679da9
--- /dev/null
+++ b/reflyd/src/utils/id.ts
@@ -0,0 +1,9 @@
+import { createId } from '@paralleldrive/cuid2';
+
+/** Build a new user id: a cuid2 value carrying the `u-` prefix. */
+export function genUID(): string {
+  const suffix = createId();
+  return `u-${suffix}`;
+}
+
+/** Build a new weblink id: a cuid2 value carrying the `l-` prefix. */
+export function genLinkID(): string {
+  const suffix = createId();
+  return `l-${suffix}`;
+}
diff --git a/reflyd/src/weblink/dto.ts b/reflyd/src/weblink/dto.ts
index 3341384a4..72a444ed7 100644
--- a/reflyd/src/weblink/dto.ts
+++ b/reflyd/src/weblink/dto.ts
@@ -1,4 +1,5 @@
 import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger';
+import { Document } from '@langchain/core/documents';
 import { IndexStatus } from '@prisma/client';
 
 export class WebLinkDTO {
@@ -33,9 +34,11 @@ export class WebLinkDTO {
   pageContent?: string; // 反爬网站前端传入
 
   @ApiPropertyOptional()
-  storageKey?: string; // 前端上传 html 拿到的 key
+  storageKey?: string; // object key obtained when the frontend uploads the html
 
   userId?: number;
+
+  parsedDoc?: Document; // Document parsed on the server side
 }
 
 export class StoredWebLink extends WebLinkDTO {
diff --git a/reflyd/src/weblink/weblink.controller.ts b/reflyd/src/weblink/weblink.controller.ts
index 3ea266757..981c53503 100644
--- a/reflyd/src/weblink/weblink.controller.ts
+++ b/reflyd/src/weblink/weblink.controller.ts
@@ -38,7 +38,7 @@ export class WeblinkController {
   @Post('store')
   async store(@Request() req, @Body() body: StoreWebLinkParam) {
     this.logger.log(`user: ${req.user.id}, store link: ${body}`);
-    await this.weblinkService.storeLinks(req.user.id, body.data);
+    await this.weblinkService.storeLinks(req.user.uid, body.data);
     return { success: true };
   }
 
@@ -47,7 +47,7 @@ export class WeblinkController {
   async getWebContent(@Query('url') url) {
     this.logger.log(`getWebContent, ${url}`);
 
-    const parseContent = await this.weblinkService.parseWebLinkContent(url); // 处理错误边界
+    const parseContent = await this.weblinkService.readWebLinkContent(url); // 处理错误边界
     return parseContent;
   }
 
diff --git a/reflyd/src/weblink/weblink.module.ts b/reflyd/src/weblink/weblink.module.ts
index 64557de42..d5382f3db 100644
--- a/reflyd/src/weblink/weblink.module.ts
+++ b/reflyd/src/weblink/weblink.module.ts
@@ -8,6 +8,7 @@ import { WeblinkService } from './weblink.service';
 import { WeblinkProcessor } from './weblink.processor';
 import { CommonModule } from '../common/common.module';
 import { AigcModule } from '../aigc/aigc.module';
+import { RAGModule } from '../rag/rag.module';
 import { QUEUE_STORE_LINK } from '../utils/const';
 
 @Module({
@@ -15,6 +16,7 @@ import { QUEUE_STORE_LINK } from '../utils/const';
     ConfigModule,
     CommonModule,
     AigcModule,
+    RAGModule,
     BullModule.registerQueue({ name: QUEUE_STORE_LINK }),
   ],
   controllers: [WeblinkController],
diff --git a/reflyd/src/weblink/weblink.service.ts b/reflyd/src/weblink/weblink.service.ts
index d09ee686f..c3bc00c76 100644
--- a/reflyd/src/weblink/weblink.service.ts
+++ b/reflyd/src/weblink/weblink.service.ts
@@ -1,26 +1,31 @@
 import { Injectable } from '@nestjs/common';
+import { ConfigService } from '@nestjs/config';
 import { Prisma, Weblink } from '@prisma/client';
 import { Queue } from 'bull';
 import { LRUCache } from 'lru-cache';
 import { InjectQueue } from '@nestjs/bull';
 import * as cheerio from 'cheerio';
 import { Document } from '@langchain/core/documents';
-import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio';
 
 import { LoggerService } from '../common/logger.service';
 import { PrismaService } from '../common/prisma.service';
 import { MinioService } from '../common/minio.service';
+import {
+  RAGService,
+  ContentAvroType,
+  PARSER_VERSION,
+} from '../rag/rag.service';
 import { AigcService } from '../aigc/aigc.service';
 import { WebLinkDTO } from './dto';
 import { getExpectedTokenLenContent } from '../utils/token';
-import { PageMeta, Source } from '../types/weblink';
+import { Source } from '../types/weblink';
 import { QUEUE_STORE_LINK } from '../utils/const';
-import { ConfigService } from '@nestjs/config';
 import { streamToString } from '../utils/stream';
+import { genLinkID } from '../utils/id';
 
 @Injectable()
 export class WeblinkService {
-  private cache: LRUCache<string, string>; // url -> document
+  private cache: LRUCache<string, Document>; // url -> parsed document (in markdown)
   private bucketName: string;
 
   constructor(
@@ -28,6 +33,7 @@ export class WeblinkService {
     private prisma: PrismaService,
     private minio: MinioService,
     private configService: ConfigService,
+    private ragService: RAGService,
     private aigcService: AigcService,
     @InjectQueue(QUEUE_STORE_LINK) private indexQueue: Queue<WebLinkDTO>,
   ) {
@@ -78,19 +84,6 @@ export class WeblinkService {
     }
   }
 
-  async createNewLink(link: WebLinkDTO, pageMeta: PageMeta) {
-    return this.prisma.weblink.create({
-      data: {
-        url: link.url,
-        indexStatus: 'processing',
-        pageContent: '', // deprecated, always empty
-        storageKey: link.storageKey,
-        pageMeta: JSON.stringify(pageMeta),
-        contentMeta: '',
-      },
-    });
-  }
-
   async getUserHistory(params: {
     skip?: number;
     take?: number;
@@ -107,65 +100,39 @@ export class WeblinkService {
    * @param {string} url - The URL of the webpage to parse
    * @return {Promise<Document>} A Promise that resolves to the parsed document
    */
-  async parseWebLinkContent(url: string): Promise<Document> {
+  async readWebLinkContent(url: string): Promise<Document> {
     // Check if the document is in the cache
     if (this.cache.has(url)) {
       this.logger.log(`in-mem cache hit: ${url}`);
-      return JSON.parse(this.cache.get(url));
+      return this.cache.get(url);
     }
 
     // Check if the document is in the database
     const weblink = await this.prisma.weblink.findUnique({
-      select: { storageKey: true, pageMeta: true },
+      select: { parsedDocStorageKey: true, pageMeta: true },
       where: { url },
     });
-    if (weblink) {
+    // Rows written by the former createNewLink carry no parsedDocStorageKey;
+    // let them fall through to the web fetch instead of asking minio for an
+    // empty object key.
+    if (weblink?.parsedDocStorageKey) {
       this.logger.log(`found weblink in db: ${url}`);
       const content = await this.minio.getObject(
         this.bucketName,
-        weblink.storageKey,
+        weblink.parsedDocStorageKey,
       );
 
       const doc = new Document({
         pageContent: await streamToString(content),
         metadata: JSON.parse(weblink.pageMeta),
       });
-      this.cache.set(url, JSON.stringify(doc));
+      this.cache.set(url, doc);
       return doc;
     }
 
     // Finally tries to fetch the content from the web
-    try {
-      const loader = new CheerioWebBaseLoader(url, {
-        maxRetries: 3,
-        timeout: 5000,
-      });
+    // Parser errors are not caught here; they propagate to the caller.
+    const doc = await this.ragService.parseWebpage(url);
 
-      // customized webpage loading
-      // TODO: remove this in the future
-      const $ = await loader.scrape();
-      // remove all styles and scripts tag
-      $('script, style, plasmo-csui, img, svg, meta, link').remove();
-      // remove comments blocks
-      $('body')
-        .contents()
-        .each((i, node) => {
-          if (node.type === 'comment') {
-            $(node).remove();
-          }
-        });
+    this.cache.set(url, doc);
 
-      // only get meaning content
-      const pageContent = $.html();
-      const title = $('title').text();
-      const source = loader.webPath;
-      const doc = { pageContent, metadata: { title, source } };
-      this.cache.set(url, JSON.stringify(doc));
-      return doc;
-    } catch (err) {
-      this.logger.error(`process url ${url} failed: ${err}`);
-      return null;
-    }
+    return doc;
   }
 
   /**
@@ -173,7 +140,7 @@ export class WeblinkService {
    * @param pageContent raw html page content
    * @returns nothing
    */
-  async downloadWebLinkContent(
+  async directParseWebLinkContent(
     url: string,
     storageKey: string,
   ): Promise<Document> {
@@ -198,7 +165,7 @@ export class WeblinkService {
    * @param weblinkList input weblinks
    * @returns langchain documents
    */
-  async parseMultiWeblinks(weblinkList: Source[]): Promise<Document[]> {
+  async readMultiWeblinks(weblinkList: Source[]): Promise<Document[]> {
     // 处理 token 窗口，一共给 6K 窗口用于问答，平均分到每个网页，保障可用性
     const avgTokenLen = 6000 / weblinkList?.length;
 
@@ -212,7 +179,7 @@ export class WeblinkService {
           }));
         }
 
-        const { pageContent, metadata } = await this.parseWebLinkContent(
+        const { pageContent, metadata } = await this.readWebLinkContent(
           item.metadata?.source,
         );
         return [
@@ -308,11 +275,64 @@ export class WeblinkService {
         totalReadTime: { increment: link.readTime || 0 },
       },
     });
+
     this.logger.log(`process link for user finish`);
 
     return uwb;
   }
 
+  /**
+   * Create the Weblink row for a link that is not in the database yet.
+   *
+   * The document is obtained first (via directParseWebLinkContent when
+   * `link.storageKey` is set, otherwise via readWebLinkContent), its text is
+   * uploaded to minio, and the row is created in `processing` state with
+   * `parsedDocStorageKey` pointing at the uploaded object.
+   *
+   * @param link - incoming weblink payload (url, title, optional storageKey)
+   * @returns the newly created Weblink row
+   */
+  async createNewWeblink(link: WebLinkDTO): Promise<Weblink> {
+    // Fetch doc and store in cache for later use
+    const doc = link.storageKey
+      ? await this.directParseWebLinkContent(link.url, link.storageKey)
+      : await this.readWebLinkContent(link.url);
+    this.cache.set(link.url, doc);
+
+    // Upload parsed doc to minio.
+    // The url is percent-encoded because a raw url puts `//`, `?` and `#` into
+    // the object key, which object stores may reject or reinterpret.
+    // TODO: sha256 of link url (would also bound the key length)
+    const parsedDocStorageKey = `docs/${encodeURIComponent(link.url)}.md`;
+    const res = await this.minio.putObject(
+      // Same bucket readWebLinkContent reads parsedDocStorageKey from.
+      this.bucketName,
+      parsedDocStorageKey,
+      doc.pageContent,
+    );
+    this.logger.log('upload parsed doc to minio res: ' + JSON.stringify(res));
+
+    return this.prisma.weblink.create({
+      data: {
+        url: link.url,
+        linkId: genLinkID(),
+        indexStatus: 'processing',
+        pageContent: '', // deprecated, always empty
+        storageKey: link.storageKey,
+        parsedDocStorageKey,
+        pageMeta: JSON.stringify({ title: link.title, source: link.url }),
+        contentMeta: '',
+      },
+    });
+  }
+
+  /**
+   * Chunk and embed a parsed document through RAGService.indexContent,
+   * archive the resulting chunks as an Avro buffer in minio, and record the
+   * object key on the weblink row.
+   *
+   * @param weblink - row being indexed (its `id` and `url` are used)
+   * @param doc - parsed document whose `pageContent` is indexed
+   * @returns the updated Weblink row
+   */
+  async indexWeblink(weblink: Weblink, doc: Document) {
+    const dataObjs = await this.ragService.indexContent({
+      url: weblink.url,
+      text: doc.pageContent,
+    });
+
+    const buf = ContentAvroType.toBuffer({ chunks: dataObjs });
+    // The key is scoped by weblink id: a constant key would make every
+    // indexed link overwrite the same object.
+    const chunkStorageKey =
+      `chunks/${weblink.id}/content-${PARSER_VERSION}.avro`;
+    const res = await this.minio.putObject(
+      // Same bucket the service reads weblink objects from.
+      this.bucketName,
+      chunkStorageKey,
+      buf,
+    );
+    this.logger.log('upload chunks to minio res: ' + JSON.stringify(res));
+
+    return this.prisma.weblink.update({
+      where: { id: weblink.id },
+      data: { chunkStorageKey },
+    });
+  }
+
   async processLinkFromStoreQueue(link: WebLinkDTO) {
     this.logger.log(`process link from queue: ${JSON.stringify(link)}`);
 
@@ -325,33 +345,17 @@ export class WeblinkService {
 
     // Link not found
     if (!weblink) {
-      if (!link.storageKey) {
-        return this.logger.warn(
-          `storageKey not provided for ${link.url}, skip`,
-        );
-      }
-
-      weblink = await this.createNewLink(link, {
-        title: link.title,
-        source: link.url,
-      });
+      weblink = await this.createNewWeblink(link);
     }
 
-    // Fetch doc and store in cache for later use
-    const doc = link.storageKey
-      ? await this.downloadWebLinkContent(link.url, link.storageKey)
-      : await this.parseWebLinkContent(link.url);
-
-    this.cache.set(link.url, JSON.stringify(doc));
+    const doc = await this.readWebLinkContent(link.url);
 
-    // TODO: 优化 page content 的清洗逻辑
-    const $ = cheerio.load(doc.pageContent);
-    doc.pageContent = $.text();
+    await this.indexWeblink(weblink, doc);
 
     // 处理单个用户的访问记录
     const uwb = await this.processLinkForUser(link, weblink);
 
-    await this.aigcService.runContentFlow({ doc, link, uwb, weblink });
+    await this.aigcService.runContentFlow({ doc, uwb, weblink });
   }
 }
 
diff --git a/reflyd/yarn.lock b/reflyd/yarn.lock
index c62f2caa6..2c2060169 100644
--- a/reflyd/yarn.lock
+++ b/reflyd/yarn.lock
@@ -400,6 +400,11 @@
   resolved "https://registry.yarnpkg.com/@eslint/js/-/js-8.56.0.tgz#ef20350fec605a7f7035a01764731b2de0f3782b"
   integrity sha512-gMsVel9D7f2HLkBma9VbtzZRehRogVRfbr++f06nL2vnCGCNlzOD+/MUov/F4p8myyAHspEhVobgjpX64q5m6A==
 
+"@graphql-typed-document-node/core@^3.1.1":
+  version "3.2.0"
+  resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.2.0.tgz#5f3d96ec6b2354ad6d8a28bf216a1d97b5426861"
+  integrity sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ==
+
 "@grpc/grpc-js@^1.7.1":
   version "1.10.6"
   resolved "https://registry.yarnpkg.com/@grpc/grpc-js/-/grpc-js-1.10.6.tgz#1e3eb1af911dc888fbef7452f56a7573b8284d54"
@@ -1009,6 +1014,11 @@
   dependencies:
     tslib "2.5.3"
 
+"@noble/hashes@^1.1.5":
+  version "1.4.0"
+  resolved "https://registry.yarnpkg.com/@noble/hashes/-/hashes-1.4.0.tgz#45814aa329f30e4fe0ba49426f49dfccdd066426"
+  integrity sha512-V1JJ1WTRUqHHrOSh597hURcMqVKVGL/ea3kv0gSnEdsEZ0/+VyPghM1lMNGc00z7CIQorSvbKpuJkxvuHbvdbg==
+
 "@nodelib/fs.scandir@2.1.5":
   version "2.1.5"
   resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5"
@@ -1724,6 +1734,13 @@
   dependencies:
     "@opentelemetry/core" "^1.1.0"
 
+"@paralleldrive/cuid2@^2.2.2":
+  version "2.2.2"
+  resolved "https://registry.yarnpkg.com/@paralleldrive/cuid2/-/cuid2-2.2.2.tgz#7f91364d53b89e2c9cb9e02e8dd0f129e834455f"
+  integrity sha512-ZOBkgDwEdoYVlSeRbYYXs0S9MejQofiVYoTbKzy/6GQa39/q5tQU2IX46+shYnUkpEl3wc+J6wRlar7r2EK2xA==
+  dependencies:
+    "@noble/hashes" "^1.1.5"
+
 "@prisma/client@5":
   version "5.12.1"
   resolved "https://registry.yarnpkg.com/@prisma/client/-/client-5.12.1.tgz#c26a674fea76754b3a9e8b90a11e617f90212f76"
@@ -2916,6 +2933,11 @@ available-typed-arrays@^1.0.7:
   dependencies:
     possible-typed-array-names "^1.0.0"
 
+avsc@^5.7.7:
+  version "5.7.7"
+  resolved "https://registry.yarnpkg.com/avsc/-/avsc-5.7.7.tgz#8d1b5fd85904cc96a1e439450633ff33f4aff57b"
+  integrity sha512-9cYNccliXZDByFsFliVwk5GvTq058Fj513CiR4E60ndDwmuXzTJEp/Bp8FyuRmGyYupLjHLs+JA9/CBoVS4/NQ==
+
 babel-jest@^29.7.0:
   version "29.7.0"
   resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-29.7.0.tgz#f4369919225b684c56085998ac63dbd05be020d5"
@@ -3553,6 +3575,13 @@ cron-parser@^4.2.1:
   dependencies:
     luxon "^3.2.1"
 
+cross-fetch@^3.1.5:
+  version "3.1.8"
+  resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.8.tgz#0327eba65fd68a7d119f8fb2bf9334a1a7956f82"
+  integrity sha512-cvA+JwZoU0Xq+h6WkMvAUqPEYy92Obet6UdKLfW60qn99ftItKjB5T+BkyWOFWe2pUyfQ+IJHmpOTznqk1M6Kg==
+  dependencies:
+    node-fetch "^2.6.12"
+
 cross-spawn@^7.0.0, cross-spawn@^7.0.2, cross-spawn@^7.0.3:
   version "7.0.3"
   resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
@@ -4113,6 +4142,11 @@ external-editor@^3.0.3:
     iconv-lite "^0.4.24"
     tmp "^0.0.33"
 
+extract-files@^9.0.0:
+  version "9.0.0"
+  resolved "https://registry.yarnpkg.com/extract-files/-/extract-files-9.0.0.tgz#8a7744f2437f81f5ed3250ed9f1550de902fe54a"
+  integrity sha512-CvdFfHkC95B4bBBk36hcEmvdR2awOdhhVUYH6S/zrVj3477zven/fJMYg7121h4T1xHZC+tetUpubpAhxwI7hQ==
+
 fast-copy@^3.0.0:
   version "3.0.2"
   resolved "https://registry.yarnpkg.com/fast-copy/-/fast-copy-3.0.2.tgz#59c68f59ccbcac82050ba992e0d5c389097c9d35"
@@ -4284,6 +4318,15 @@ form-data-encoder@1.7.2:
   resolved "https://registry.yarnpkg.com/form-data-encoder/-/form-data-encoder-1.7.2.tgz#1f1ae3dccf58ed4690b86d87e4f57c654fbab040"
   integrity sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==
 
+form-data@^3.0.0:
+  version "3.0.1"
+  resolved "https://registry.yarnpkg.com/form-data/-/form-data-3.0.1.tgz#ebd53791b78356a99af9a300d4282c4d5eb9755f"
+  integrity sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg==
+  dependencies:
+    asynckit "^0.4.0"
+    combined-stream "^1.0.8"
+    mime-types "^2.1.12"
+
 form-data@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.0.tgz#93919daeaf361ee529584b9b31664dc12c9fa452"
@@ -4526,6 +4569,21 @@ graphemer@^1.4.0:
   resolved "https://registry.yarnpkg.com/graphemer/-/graphemer-1.4.0.tgz#fb2f1d55e0e3a1849aeffc90c4fa0dd53a0e66c6"
   integrity sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==
 
+graphql-request@^5.2.0:
+  version "5.2.0"
+  resolved "https://registry.yarnpkg.com/graphql-request/-/graphql-request-5.2.0.tgz#a05fb54a517d91bb2d7aefa17ade4523dc5ebdca"
+  integrity sha512-pLhKIvnMyBERL0dtFI3medKqWOz/RhHdcgbZ+hMMIb32mEPa5MJSzS4AuXxfI4sRAu6JVVk5tvXuGfCWl9JYWQ==
+  dependencies:
+    "@graphql-typed-document-node/core" "^3.1.1"
+    cross-fetch "^3.1.5"
+    extract-files "^9.0.0"
+    form-data "^3.0.0"
+
+graphql@^16.8.1:
+  version "16.8.1"
+  resolved "https://registry.yarnpkg.com/graphql/-/graphql-16.8.1.tgz#1930a965bef1170603702acdb68aedd3f3cf6f07"
+  integrity sha512-59LZHPdGZVh695Ud9lRzPBVTtlX9ZCV150Er2W43ro37wVof0ctenSaskPPjN7lVTIN8mSZt8PHUNKZuNQUuxw==
+
 grpc-tools@^1.12.4:
   version "1.12.4"
   resolved "https://registry.yarnpkg.com/grpc-tools/-/grpc-tools-1.12.4.tgz#a044c9e8157941033ea7a5f144c2dc9dc4501de4"
@@ -6029,7 +6087,7 @@ node-emoji@1.11.0:
   dependencies:
     lodash "^4.17.21"
 
-node-fetch@^2.0.0, node-fetch@^2.6.1, node-fetch@^2.6.7, node-fetch@^2.6.9:
+node-fetch@^2.0.0, node-fetch@^2.6.1, node-fetch@^2.6.12, node-fetch@^2.6.7, node-fetch@^2.6.9:
   version "2.7.0"
   resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d"
   integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==
@@ -7720,6 +7778,14 @@ wcwidth@^1.0.1:
   dependencies:
     defaults "^1.0.3"
 
+weaviate-ts-client@^2.1.1:
+  version "2.1.1"
+  resolved "https://registry.yarnpkg.com/weaviate-ts-client/-/weaviate-ts-client-2.1.1.tgz#5bf142f928b59be6cf74a5f388fbe03db11e6abc"
+  integrity sha512-d8yc2KnIEIV1beHAU8mhrElT3BoROoXGDsLlqFX8QGx3G+gOiPTRMc7SLy4F17+LvaUaTD0XkHvWX++4iehnsg==
+  dependencies:
+    graphql-request "^5.2.0"
+    uuid "^9.0.1"
+
 web-encoding@^1.1.5:
   version "1.1.5"
   resolved "https://registry.yarnpkg.com/web-encoding/-/web-encoding-1.1.5.tgz#fc810cf7667364a6335c939913f5051d3e0c4864"