DataArcTech
diff --git a/config/core/file_management/indexing/bm25_indexing_config.py b/config/core/file_management/indexing/bm25_indexing_config.py
Lines changed: 1 addition & 5 deletions
diff --git a/core/file_management/extractor/graphextractor.py b/core/file_management/extractor/graphextractor.py
Lines changed: 1 addition & 1 deletion
diff --git a/core/file_management/extractor/hipporag2_extractor.py b/core/file_management/extractor/hipporag2_extractor.py
Lines changed: 85 additions & 24 deletions
diff --git a/core/file_management/indexing/bm25_indexing.py b/core/file_management/indexing/bm25_indexing.py
Lines changed: 11 additions & 16 deletions
@@ -13,16 +13,12 @@ class BM25IndexerConfig(AbstractConfig):
     # Batch processing configuration
     batch_size: int = Field(
         default=100,
-        description="Number of chunks to accumulate before flushing to index"
+        description="Number of chunks to accumulate before triggering a flush"
     )
     flush_interval: float = Field(
         default=5.0,
         description="Time interval (in seconds) to periodically flush pending chunks"
     )
-    immediate_flush_threshold: int = Field(
-        default=10,
-        description="If pending chunks <= this threshold, flush immediately instead of waiting. Set to 0 to disable."
-    )
 
     def build(self):
         return BM25Indexer(self)
@@ -345,7 +345,7 @@ def is_valid_entity(self, name: str) -> bool:
             return False
 
         # Filter pure numbers
-        if re.match(r'^\d+$', name) or re.match(r'^[\d\s\.,;:!?()\[\]{}""''\-_]+$', name):
+        if re.match(r'^\d+$', name) or re.match(r'^[\d\s\.,;:!?()\[\]{}""''\\-_]+$', name):
             return False
 
         return True
 
@@ -11,6 +11,7 @@
 """
 
 import logging
+import re
 from typing import List, TYPE_CHECKING, Tuple
 
 from core.file_management.extractor.base import ExtractorBase
@@ -19,7 +20,12 @@
     HIPPORAG2_NER_ONE_SHOT_INPUT, HIPPORAG2_NER_ONE_SHOT_OUTPUT,
     HIPPORAG2_NER_ONE_SHOT_INPUT_WITH_TYPES, HIPPORAG2_NER_ONE_SHOT_OUTPUT_WITH_TYPES,
     HIPPORAG2_NER_PROMPT, HIPPORAG2_NER_PROMPT_WITH_TYPES,
-    HIPPORAG2_TRIPLE_SYSTEM, HIPPORAG2_TRIPLE_ONE_SHOT_INPUT, HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT, HIPPORAG2_TRIPLE_PROMPT
+    HIPPORAG2_TRIPLE_SYSTEM, HIPPORAG2_TRIPLE_ONE_SHOT_INPUT, HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT, HIPPORAG2_TRIPLE_PROMPT,
+    HIPPORAG2_NER_SYSTEM_ZH, HIPPORAG2_NER_SYSTEM_WITH_TYPES_ZH,
+    HIPPORAG2_NER_ONE_SHOT_INPUT_ZH, HIPPORAG2_NER_ONE_SHOT_OUTPUT_ZH,
+    HIPPORAG2_NER_ONE_SHOT_INPUT_WITH_TYPES_ZH, HIPPORAG2_NER_ONE_SHOT_OUTPUT_WITH_TYPES_ZH,
+    HIPPORAG2_NER_PROMPT_ZH, HIPPORAG2_NER_PROMPT_WITH_TYPES_ZH,
+    HIPPORAG2_TRIPLE_SYSTEM_ZH, HIPPORAG2_TRIPLE_ONE_SHOT_INPUT_ZH, HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT_ZH, HIPPORAG2_TRIPLE_PROMPT_ZH
 )
 from encapsulation.data_model.schema import Chunk, GraphData
 
@@ -46,6 +52,25 @@ def __init__(self, config: "HippoRAG2ExtractorConfig"):
         self.logger = logging.getLogger(__name__)
         self.entity_types = getattr(config, 'entity_types', None)  # Optional entity types to extract
 
+    def detect_language(self, text: str) -> str:
+        """
+        Guess whether ``text`` is Chinese or English from its share of Han characters
+
+        Args:
+            text: Passage to classify
+
+        Returns:
+            'zh' when more than 10% of the non-whitespace characters lie in
+            U+4E00-U+9FFF; 'en' otherwise, including for empty or blank text
+        """
+        non_space_count = len(re.sub(r'\s', '', text))
+        if non_space_count == 0:
+            return 'en'
+        han_count = len(re.findall(r'[\u4e00-\u9fff]', text))
+        if han_count / non_space_count > 0.1:
+            return 'zh'
+        return 'en'
+
     async def extract(self, chunk: Chunk) -> GraphData:
         """
         Main extraction method for HippoRAG2
@@ -79,7 +104,7 @@ async def extract_two_stage(self, chunk: Chunk) -> GraphData:
             if not entities:
                 self.logger.warning("No entities extracted, skipping triple extraction")
                 return GraphData()
-            
+            self.logger.debug("Extracted entities: %s", entities)
             # Stage 2: Triple Extraction using extracted entities
             triples = await self.extract_triples(chunk.content, entities)
 
@@ -132,7 +157,6 @@ async def extract_triples(self, text: str, entities: List[Tuple[str, str]]) -> L
             # Extract just entity names for the prompt
             entity_names = [entity[0] for entity in entities]
             prompt = self.build_triple_prompt(text, entity_names)
-
             response = await self.llm.achat([{"role": "user", "content": prompt}])
 
             triples = self.parse_triple_response(response)
@@ -147,6 +171,7 @@ async def extract_triples(self, text: str, entities: List[Tuple[str, str]]) -> L
     def build_ner_prompt(self, text: str) -> str:
         """
         Build NER prompt - always outputs entity types in TSV format
+        Supports both Chinese and English
 
         Args:
             text: Input text to extract entities from
@@ -158,37 +183,73 @@ def build_ner_prompt(self, text: str) -> str:
             - If self.entity_types is specified: uses HIPPORAG2_NER_PROMPT_WITH_TYPES
             - If self.entity_types is None: uses HIPPORAG2_NER_PROMPT (LLM auto-determines types)
             - Both formats output entity\ttype TSV format
+            - Language is auto-detected (Chinese or English)
         """
+        # Detect language
+        language = self.detect_language(text)
+
         if self.entity_types:
             # Use entity type-specific prompt (only extract specified types)
             entity_types_str = ', '.join(self.entity_types)
-            return HIPPORAG2_NER_PROMPT_WITH_TYPES.format(
-                system=HIPPORAG2_NER_SYSTEM_WITH_TYPES,
-                entity_types=entity_types_str,
-                example_input=HIPPORAG2_NER_ONE_SHOT_INPUT_WITH_TYPES,
-                example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT_WITH_TYPES,
-                passage=text
-            )
+            if language == 'zh':
+                return HIPPORAG2_NER_PROMPT_WITH_TYPES_ZH.format(
+                    system=HIPPORAG2_NER_SYSTEM_WITH_TYPES_ZH,
+                    entity_types=entity_types_str,
+                    example_input=HIPPORAG2_NER_ONE_SHOT_INPUT_WITH_TYPES_ZH,
+                    example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT_WITH_TYPES_ZH,
+                    passage=text
+                )
+            else:
+                return HIPPORAG2_NER_PROMPT_WITH_TYPES.format(
+                    system=HIPPORAG2_NER_SYSTEM_WITH_TYPES,
+                    entity_types=entity_types_str,
+                    example_input=HIPPORAG2_NER_ONE_SHOT_INPUT_WITH_TYPES,
+                    example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT_WITH_TYPES,
+                    passage=text
+                )
         else:
             # Use auto-type prompt (LLM determines entity types)
-            return HIPPORAG2_NER_PROMPT.format(
-                system=HIPPORAG2_NER_SYSTEM,
-                example_input=HIPPORAG2_NER_ONE_SHOT_INPUT,
-                example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT,
-                passage=text
-            )
+            if language == 'zh':
+                return HIPPORAG2_NER_PROMPT_ZH.format(
+                    system=HIPPORAG2_NER_SYSTEM_ZH,
+                    example_input=HIPPORAG2_NER_ONE_SHOT_INPUT_ZH,
+                    example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT_ZH,
+                    passage=text
+                )
+            else:
+                return HIPPORAG2_NER_PROMPT.format(
+                    system=HIPPORAG2_NER_SYSTEM,
+                    example_input=HIPPORAG2_NER_ONE_SHOT_INPUT,
+                    example_output=HIPPORAG2_NER_ONE_SHOT_OUTPUT,
+                    passage=text
+                )
 
     def build_triple_prompt(self, text: str, entities: List[str]) -> str:
-        """Build triple extraction prompt"""
+        """
+        Assemble the triple extraction prompt for a passage and its entity names
+        The Chinese or English template set is chosen from the detected passage language
+        """
         entities_str = '\n'.join(entities)
 
-        return HIPPORAG2_TRIPLE_PROMPT.format(
-            system=HIPPORAG2_TRIPLE_SYSTEM,
-            example_input=HIPPORAG2_TRIPLE_ONE_SHOT_INPUT,
-            example_output=HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT,
-            passage=text,
-            entities=entities_str
-        )
+        # Choose the template set that matches the passage language
+        if self.detect_language(text) == 'zh':
+            template, system, shot_in, shot_out = (
+                HIPPORAG2_TRIPLE_PROMPT_ZH, HIPPORAG2_TRIPLE_SYSTEM_ZH,
+                HIPPORAG2_TRIPLE_ONE_SHOT_INPUT_ZH, HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT_ZH,
+            )
+        else:
+            template, system, shot_in, shot_out = (
+                HIPPORAG2_TRIPLE_PROMPT, HIPPORAG2_TRIPLE_SYSTEM,
+                HIPPORAG2_TRIPLE_ONE_SHOT_INPUT, HIPPORAG2_TRIPLE_ONE_SHOT_OUTPUT,
+            )
+
+        return template.format(
+            system=system,
+            example_input=shot_in,
+            example_output=shot_out,
+            passage=text,
+            entities=entities_str
+        )
 
     def parse_ner_response(self, response: str) -> List[Tuple[str, str]]:
         """
 
@@ -29,7 +29,6 @@ def __init__(self, config: "BM25IndexerConfig"):
         # Batch processing configuration
         self.batch_size = config.batch_size
         self.flush_interval = config.flush_interval
-        self.immediate_flush_threshold = config.immediate_flush_threshold
 
         # Async lock to ensure only one coroutine writes to the index
         self._write_lock = asyncio.Lock()
@@ -138,10 +137,12 @@ async def update_index(self, chunks: List[Chunk]) -> List[str]:
         """
         Adds chunks to the pending queue for batch processing.
 
-        Flush strategies:
-        1. If pending chunks >= batch_size: flush immediately
-        2. If pending chunks <= immediate_flush_threshold: flush immediately (for small uploads)
-        3. Otherwise: wait for periodic flush
+        This method is NON-BLOCKING - it adds chunks to the queue and returns immediately.
+        The actual indexing happens in the background flush worker.
+
+        Flush trigger strategies:
+        1. If pending chunks >= batch_size: trigger immediate flush (non-blocking)
+        2. Otherwise: wait for periodic flush
         """
         if not chunks:
             return []
@@ -154,19 +155,18 @@ async def update_index(self, chunks: List[Chunk]) -> List[str]:
         total_pending = len(self._pending_chunks)
         logger.info(f"Added {len(chunks)} chunks to pending queue. Total pending: {total_pending}")
 
-        # Strategy 1: Batch size reached - flush immediately
+        # Strategy 1: Batch size reached - trigger immediate flush (non-blocking)
         if total_pending >= self.batch_size:
-            logger.info(f"Batch size ({self.batch_size}) reached, flushing immediately")
-            return await self._flush_pending_chunks()
-
-        # Strategy 2: Small upload - flush immediately to reduce latency
-        if self.immediate_flush_threshold > 0 and total_pending <= self.immediate_flush_threshold:
-            logger.info(f"Small batch ({total_pending} <= {self.immediate_flush_threshold}), flushing immediately")
-            return await self._flush_pending_chunks()
+            logger.info(f"Batch size ({self.batch_size}) reached, triggering immediate flush")
+            # Create a flush task but don't wait for it. The event loop keeps only a
+            # weak reference to tasks, so hold a strong one until the flush finishes.
+            flush_task = asyncio.create_task(self._flush_pending_chunks())
+            if not hasattr(self, "_inflight_flush_tasks"):
+                self._inflight_flush_tasks = set()
+            self._inflight_flush_tasks.add(flush_task)
+            flush_task.add_done_callback(self._inflight_flush_tasks.discard)
 
-        # Strategy 3: Medium batch - wait for periodic flush
-        logger.info(f"Medium batch ({total_pending}), waiting for periodic flush")
-        # Return the chunk IDs optimistically (they will be indexed later)
+        # Return chunk IDs immediately (they will be indexed by background worker)
         return [chunk.id for chunk in chunks]
 
     async def shutdown(self):