vectorlessflow
diff --git a/rust/src/retrieval/complexity/detector.rs b/rust/src/retrieval/complexity/detector.rs
Lines changed: 168 additions & 101 deletions
@@ -2,143 +2,133 @@
 // SPDX-License-Identifier: Apache-2.0
 
 //! Query complexity detector implementation.
+//!
+//! Uses Pilot's LLM client for accurate complexity classification when available.
+//! Falls back to heuristic rules (keyword + word count) when no LLM client.
 
 use std::collections::HashSet;
 
 use super::QueryComplexity;
 
-/// Configuration for complexity detection.
-#[derive(Debug, Clone)]
-pub struct ComplexityConfig {
-    /// Maximum words for simple query.
-    pub simple_max_words: usize,
-    /// Maximum words for medium query.
-    pub medium_max_words: usize,
-    /// Complexity indicators (words that suggest complex queries).
-    pub complex_indicators: Vec<String>,
-    /// Simple query indicators.
-    pub simple_indicators: Vec<String>,
-}
-
-impl Default for ComplexityConfig {
-    fn default() -> Self {
-        Self {
-            simple_max_words: 5,
-            medium_max_words: 15,
-            complex_indicators: vec![
-                "compare".to_string(),
-                "contrast".to_string(),
-                "analyze".to_string(),
-                "evaluate".to_string(),
-                "synthesize".to_string(),
-                "explain why".to_string(),
-                "how does".to_string(),
-                "what are the implications".to_string(),
-                "relationship between".to_string(),
-                "cause and effect".to_string(),
-            ],
-            simple_indicators: vec![
-                "what is".to_string(),
-                "define".to_string(),
-                "list".to_string(),
-                "who".to_string(),
-                "when".to_string(),
-                "where".to_string(),
-            ],
-        }
-    }
-}
-
 /// Query complexity detector.
 ///
-/// Analyzes queries to determine their complexity level,
-/// which influences strategy selection.
+/// Uses LLM for classification when available; falls back to heuristic rules.
+///
+/// Build it with `new()` for heuristic-only detection, or with
+/// `with_llm_client()` to enable the LLM path.
 pub struct ComplexityDetector {
-    config: ComplexityConfig,
+    /// Optional LLM client for LLM-based detection.
+    /// `None` means `detect` goes straight to the heuristic rules.
+    llm_client: Option<crate::llm::LlmClient>,
 }
 
 impl ComplexityDetector {
-    /// Create a new complexity detector.
+    /// Create a new complexity detector (heuristic only).
+    ///
+    /// No LLM client is attached, so `detect` always uses the heuristic rules.
     pub fn new() -> Self {
+        Self { llm_client: None }
+    }
+
+    /// Create with LLM client for accurate detection.
+    ///
+    /// Takes ownership of `client`. `detect` asks it first and only falls
+    /// back to the heuristic rules when it yields no classification.
+    pub fn with_llm_client(client: crate::llm::LlmClient) -> Self {
         Self {
-            config: ComplexityConfig::default(),
+            llm_client: Some(client),
         }
     }
 
-    /// Create with custom configuration.
-    pub fn with_config(config: ComplexityConfig) -> Self {
-        Self { config }
+    /// Classify `query`, preferring the LLM and degrading to heuristics.
+    ///
+    /// When an LLM client is configured it is asked first; if it yields no
+    /// classification, a warning is logged and the heuristic rules decide.
+    /// Without a client the heuristic rules are used directly (no warning).
+    pub async fn detect(&self, query: &str) -> QueryComplexity {
+        let llm_verdict = match self.llm_client.as_ref() {
+            Some(client) => crate::retrieval::pilot::detect_with_llm(client, query).await,
+            None => return self.detect_heuristic(query),
+        };
+
+        match llm_verdict {
+            Some(level) => level,
+            None => {
+                tracing::warn!("LLM complexity detection failed, falling back to heuristic");
+                self.detect_heuristic(query)
+            }
+        }
     }
 
-    /// Detect the complexity of a query.
-    pub fn detect(&self, query: &str) -> QueryComplexity {
+    /// Heuristic-based fallback: keyword matching + word count.
+    ///
+    /// Rules are applied in order and the first match wins:
+    /// 1. any complex indicator appears in the lowercased query => `Complex`;
+    /// 2. any simple indicator appears and the estimated word count is at
+    ///    most 15 => `Simple`;
+    /// 3. more than one question mark (ASCII or fullwidth) => `Complex`;
+    /// 4. otherwise by estimated word count: <= 5 `Simple`, <= 15 `Medium`,
+    ///    anything longer `Complex`.
+    fn detect_heuristic(&self, query: &str) -> QueryComplexity {
         let query_lower = query.to_lowercase();
-        let word_count = query.split_whitespace().count();
+        let word_count = estimate_word_count(query);
+
+        // Complex indicators (English + Chinese)
+        let complex_indicators = [
+            "compare",
+            "contrast",
+            "analyze",
+            "evaluate",
+            "synthesize",
+            "explain why",
+            "how does",
+            "relationship between",
+            "cause and effect",
+            "对比",
+            "分析",
+            "评估",
+            "综合",
+            "为什么",
+            "原因",
+            "关系",
+            "影响",
+            "区别",
+            "异同",
+        ];
 
-        // Check for complex indicators
-        for indicator in &self.config.complex_indicators {
+        // Plain substring test: a single hit anywhere in the query is enough,
+        // and it takes precedence over every later rule.
+        for indicator in &complex_indicators {
             if query_lower.contains(indicator) {
                 return QueryComplexity::Complex;
             }
         }
 
-        // Check for simple indicators
-        for indicator in &self.config.simple_indicators {
-            if query_lower.contains(indicator) {
-                // Simple indicator found, but check word count
-                if word_count <= self.config.medium_max_words {
-                    return QueryComplexity::Simple;
-                }
+        // Simple indicators
+        let simple_indicators = [
+            "what is",
+            "define",
+            "list",
+            "who",
+            "when",
+            "where",
+            "什么是",
+            "定义",
+            "列表",
+            "谁",
+            "何时",
+            "哪里",
+            "在哪",
+        ];
+
+        // NOTE(review): substring containment also fires inside longer words
+        // (e.g. "who" in "whole", "list" in "specialist") — confirm intended.
+        for indicator in &simple_indicators {
+            if query_lower.contains(indicator) && word_count <= 15 {
+                return QueryComplexity::Simple;
             }
         }
 
-        // Check for multiple questions
-        let question_marks = query.matches('?').count();
+        // Multiple questions
+        // Counts both the ASCII '?' and the fullwidth '？'.
+        let question_marks = query.matches('?').count() + query.matches('？').count();
         if question_marks > 1 {
             return QueryComplexity::Complex;
         }
 
-        // Check for conjunctions suggesting multiple parts
-        let conjunctions = ["and", "or", "but", "however", "although"];
-        let conjunction_count = conjunctions
-            .iter()
-            .filter(|c| query_lower.split_whitespace().any(|w| w == **c))
-            .count();
-
-        if conjunction_count >= 2 {
-            return QueryComplexity::Complex;
-        }
-
-        // Check for nested concepts
-        let depth_indicators = ["in the context of", "with respect to", "regarding", "about"];
-        for indicator in depth_indicators {
-            if query_lower.contains(indicator) {
-                return QueryComplexity::Medium;
-            }
-        }
-
-        // Word count based classification
-        if word_count <= self.config.simple_max_words {
+        // Word count classification
+        if word_count <= 5 {
             QueryComplexity::Simple
-        } else if word_count <= self.config.medium_max_words {
+        } else if word_count <= 15 {
             QueryComplexity::Medium
         } else {
             QueryComplexity::Complex
         }
     }
 
     /// Get complexity score (0.0 - 1.0).
+    ///
+    /// Maps an already-computed level to a fixed score: `Simple` => 0.2,
+    /// `Medium` => 0.5, `Complex` => 0.8. Performs no detection itself.
+    ///
+    /// NOTE(review): the parameter changed from `query: &str` to a
+    /// `QueryComplexity`; any caller outside this file must be updated — confirm.
-    pub fn complexity_score(&self, query: &str) -> f32 {
-        match self.detect(query) {
+    pub fn complexity_score(&self, complexity: QueryComplexity) -> f32 {
+        match complexity {
             QueryComplexity::Simple => 0.2,
             QueryComplexity::Medium => 0.5,
             QueryComplexity::Complex => 0.8,
         }
     }
 
-    /// Analyze query features.
+    /// Analyze query features (heuristic only, no LLM call).
     pub fn analyze(&self, query: &str) -> QueryAnalysis {
-        let query_lower = query.to_lowercase();
         let words: Vec<&str> = query.split_whitespace().collect();
         let unique_words: HashSet<&str> = words.iter().copied().collect();
 
@@ -149,10 +139,10 @@ impl ComplexityDetector {
             } else {
                 unique_words.len() as f32 / words.len() as f32
             },
-            has_question_mark: query.contains('?'),
-            question_count: query.matches('?').count(),
-            complexity: self.detect(query),
-            complexity_score: self.complexity_score(query),
+            has_question_mark: query.contains('?') || query.contains('？'),
+            question_count: query.matches('?').count() + query.matches('？').count(),
+            complexity: self.detect_heuristic(query),
+            complexity_score: self.complexity_score(self.detect_heuristic(query)),
         }
     }
 }
@@ -163,6 +153,52 @@ impl Default for ComplexityDetector {
     }
 }
 
+/// Estimate word count, handling both CJK and space-delimited text.
+///
+/// Counting rules:
+/// - each alphanumeric character accepted by `is_cjk_char` counts as one
+///   word, because those scripts are written without spaces between words;
+/// - any other maximal run of Unicode letters/digits counts as one word, so
+///   precomposed accented Latin ("résumé"), Cyrillic or Hangul words are
+///   neither split nor dropped;
+/// - whitespace, punctuation and symbols (including CJK punctuation such as
+///   `。` or `、`) only separate words and are never counted themselves.
+///
+/// Returns 0 for empty, whitespace-only or punctuation-only input.
+fn estimate_word_count(text: &str) -> usize {
+    let mut count = 0usize;
+    let mut in_word = false;
+
+    for ch in text.chars() {
+        // Unicode-aware test: the ASCII-only check used previously treated
+        // every non-ASCII letter as a separator.
+        let is_word_char = ch.is_alphanumeric();
+        // Requiring `is_word_char` keeps CJK punctuation from being counted.
+        let is_cjk = is_word_char && is_cjk_char(ch);
+
+        // A separator or a CJK character closes the pending run of letters.
+        if in_word && (!is_word_char || is_cjk) {
+            count += 1;
+            in_word = false;
+        }
+
+        if is_cjk {
+            // Each CJK character is a word on its own.
+            count += 1;
+        } else if is_word_char {
+            in_word = true;
+        }
+    }
+
+    // Flush a run that extends to the end of the text.
+    if in_word {
+        count += 1;
+    }
+    count
+}
+
+/// Report whether `ch` falls in one of the CJK code-point blocks listed below.
+///
+/// Covers Han ideographs (unified, extensions A–C, compatibility forms), the
+/// CJK Symbols and Punctuation block, Hiragana and Katakana. No Hangul range
+/// is listed, so Korean syllables are not matched.
+fn is_cjk_char(ch: char) -> bool {
+    matches!(
+        u32::from(ch),
+        0x3000..=0x303F // CJK Symbols and Punctuation
+            | 0x3040..=0x309F // Hiragana
+            | 0x30A0..=0x30FF // Katakana
+            | 0x3400..=0x4DBF // CJK Unified Ideographs Extension A
+            | 0x4E00..=0x9FFF // CJK Unified Ideographs
+            | 0xF900..=0xFAFF // CJK Compatibility Ideographs
+            | 0x20000..=0x2A6DF // CJK Unified Ideographs Extension B
+            | 0x2A700..=0x2B73F // CJK Unified Ideographs Extension C
+            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
+    )
+}
+
 /// Analysis result for a query.
 #[derive(Debug, Clone)]
 pub struct QueryAnalysis {
@@ -188,21 +224,40 @@ mod tests {
     fn test_simple_queries() {
         let detector = ComplexityDetector::new();
 
-        assert_eq!(detector.detect("What is Rust?"), QueryComplexity::Simple);
-        assert_eq!(detector.detect("Define async"), QueryComplexity::Simple);
-        assert_eq!(detector.detect("List features"), QueryComplexity::Simple);
+        assert_eq!(
+            detector.detect_heuristic("What is Rust?"),
+            QueryComplexity::Simple
+        );
+        assert_eq!(
+            detector.detect_heuristic("Define async"),
+            QueryComplexity::Simple
+        );
+        assert_eq!(
+            detector.detect_heuristic("什么是向量检索"),
+            QueryComplexity::Simple
+        );
     }
 
     #[test]
     fn test_complex_queries() {
         let detector = ComplexityDetector::new();
 
+        // English complex keywords ("compare", "contrast") force `Complex`.
         assert_eq!(
-            detector.detect("Compare and contrast the different approaches to async programming"),
+            detector.detect_heuristic(
+                "Compare and contrast the different approaches to async programming"
+            ),
+            QueryComplexity::Complex
+        );
+        // "relationship between" wins over the simple indicator "what is"
+        // because complex indicators are checked first.
+        assert_eq!(
+            detector.detect_heuristic("What is the relationship between ownership and borrowing?"),
+            QueryComplexity::Complex
+        );
+        // Chinese complex indicators: "对比" and "区别".
+        assert_eq!(
+            detector.detect_heuristic("对比A和B的区别"),
             QueryComplexity::Complex
         );
+        // Chinese complex indicators: "分析" and "关系".
         assert_eq!(
-            detector.detect("What is the relationship between ownership and borrowing?"),
+            detector.detect_heuristic("分析索引和检索的关系"),
             QueryComplexity::Complex
         );
     }
@@ -211,8 +266,20 @@ mod tests {
     fn test_medium_queries() {
         let detector = ComplexityDetector::new();
 
-        // Medium length without complex indicators
         let medium_query = "How do I implement a simple web server with error handling?";
-        assert_eq!(detector.detect(medium_query), QueryComplexity::Medium);
+        assert_eq!(detector.detect_heuristic(medium_query), QueryComplexity::Medium);
+    }
+
+    #[test]
+    fn test_estimate_word_count() {
+        // Whitespace-delimited Latin words count once each.
+        assert_eq!(estimate_word_count("hello world"), 2);
+        // Every CJK character is its own word: 什/么/是/向/量 = 5.
+        assert_eq!(estimate_word_count("什么是向量"), 5);
+        // Mixed text: 3 CJK characters + "vector" + "search" = 5.
+        assert_eq!(estimate_word_count("什么是 vector search"), 5);
+        // Degenerate inputs contain no words.
+        assert_eq!(estimate_word_count(""), 0);
+        assert_eq!(estimate_word_count("   "), 0);
+    }
+
+    #[test]
+    fn test_no_llm_is_ok() {
+        // A detector built with `new()` must start without an LLM client.
+        assert!(ComplexityDetector::new().llm_client.is_none());
     }
 }