|
| 1 | +<?php |
| 2 | + |
| 3 | +/* |
| 4 | + * This file is part of the Symfony package. |
| 5 | + * |
| 6 | + * (c) Fabien Potencier <fabien@symfony.com> |
| 7 | + * |
| 8 | + * For the full copyright and license information, please view the LICENSE |
| 9 | + * file that was distributed with this source code. |
| 10 | + */ |
| 11 | + |
| 12 | +namespace Symfony\AI\Store\Bridge\Postgres; |
| 13 | + |
| 14 | +use Symfony\AI\Platform\Vector\Vector; |
| 15 | +use Symfony\AI\Platform\Vector\VectorInterface; |
| 16 | +use Symfony\AI\Store\Document\Metadata; |
| 17 | +use Symfony\AI\Store\Document\VectorDocument; |
| 18 | +use Symfony\AI\Store\Exception\InvalidArgumentException; |
| 19 | +use Symfony\AI\Store\ManagedStoreInterface; |
| 20 | +use Symfony\AI\Store\StoreInterface; |
| 21 | +use Symfony\Component\Uid\Uuid; |
| 22 | + |
| 23 | +/** |
| 24 | + * Hybrid Search Store for PostgreSQL/Supabase |
| 25 | + * Combines pgvector (semantic) + PostgreSQL Full-Text Search (ts_rank_cd) using RRF. |
| 26 | + * |
| 27 | + * Uses Reciprocal Rank Fusion (RRF) to combine vector similarity and full-text search, |
| 28 | + * following the same approach as Supabase hybrid search implementation. |
| 29 | + * |
| 30 | + * Requirements: |
| 31 | + * - PostgreSQL with pgvector extension |
| 32 | + * - A 'content' text field for full-text search |
| 33 | + * |
| 34 | + * @see https://supabase.com/docs/guides/ai/hybrid-search |
| 35 | + * |
| 36 | + * @author Ahmed EBEN HASSINE <ahmedbhs123@æmail.com> |
| 37 | + */ |
final readonly class PostgresHybridStore implements ManagedStoreInterface, StoreInterface
{
    /**
     * @param \PDO       $connection       PDO connection to the PostgreSQL database
     * @param string     $tableName        Name of the table holding the documents (interpolated into SQL as-is)
     * @param string     $vectorFieldName  Name of the vector field
     * @param string     $contentFieldName Name of the text field for FTS
     * @param float      $semanticRatio    Ratio between semantic (vector) and keyword (FTS) search (0.0 to 1.0)
     *                                     - 0.0 = 100% keyword search (FTS)
     *                                     - 0.5 = balanced hybrid search
     *                                     - 1.0 = 100% semantic search (vector only) - default
     * @param Distance   $distance         Distance metric for vector similarity
     * @param string     $language         PostgreSQL text search configuration (default: 'simple')
     *                                     - 'simple': Works for ALL languages, no stemming (recommended for multilingual content)
     *                                     - 'english', 'french', 'spanish', etc.: Language-specific stemming/stopwords
     *                                     The value is interpolated into SQL as-is, so it must come from trusted configuration
     * @param int        $rrfK             RRF (Reciprocal Rank Fusion) constant for hybrid search (default: 60)
     *                                     Higher values = more equal weighting between results
     * @param float|null $defaultMaxScore  Default maximum distance threshold (default: null = no filter)
     *                                     When set (or when the "maxScore" query option is given), the condition
     *                                     "distance <= maxScore" is added to the WHERE clause of every search mode
     *                                     Example: 0.8 means only consider documents with distance <= 0.8
     *
     * @throws InvalidArgumentException When $semanticRatio is outside the [0.0, 1.0] range
     */
    public function __construct(
        private \PDO $connection,
        private string $tableName,
        private string $vectorFieldName = 'embedding',
        private string $contentFieldName = 'content',
        private float $semanticRatio = 1.0,
        private Distance $distance = Distance::L2,
        private string $language = 'simple',
        private int $rrfK = 60,
        private ?float $defaultMaxScore = null,
    ) {
        self::assertValidSemanticRatio($semanticRatio);
    }

    /**
     * Creates the pgvector extension, the table, the vector index and the GIN full-text index.
     *
     * Every statement uses IF NOT EXISTS, so calling this method again does not fail on existing objects.
     *
     * NOTE(review): the default operator class is "vector_cosine_ops" while the constructor's default
     * distance is Distance::L2; pass "index_opclass" so that it matches the configured distance
     * (confirm against the pgvector operator-class documentation).
     *
     * @param array{
     *     vector_type?: string,
     *     vector_size?: int,
     *     index_method?: string,
     *     index_opclass?: string
     * } $options Defaults: vector_type "vector", vector_size 1536, index_method "ivfflat", index_opclass "vector_cosine_ops"
     */
    public function setup(array $options = []): void
    {
        // Enable pgvector extension
        $this->connection->exec('CREATE EXTENSION IF NOT EXISTS vector');

        // Create table with vector field, content field for FTS, and a generated tsvector field
        $this->connection->exec(
            \sprintf(
                'CREATE TABLE IF NOT EXISTS %s (
                    id UUID PRIMARY KEY,
                    metadata JSONB,
                    %s TEXT NOT NULL,
                    %s %s(%d) NOT NULL,
                    content_tsv tsvector GENERATED ALWAYS AS (to_tsvector(\'%s\', %s)) STORED
                )',
                $this->tableName,
                $this->contentFieldName,
                $this->vectorFieldName,
                $options['vector_type'] ?? 'vector',
                $options['vector_size'] ?? 1536,
                $this->language,
                $this->contentFieldName,
            ),
        );

        // Create vector index
        $this->connection->exec(
            \sprintf(
                'CREATE INDEX IF NOT EXISTS %s_%s_idx ON %s USING %s (%s %s)',
                $this->tableName,
                $this->vectorFieldName,
                $this->tableName,
                $options['index_method'] ?? 'ivfflat',
                $this->vectorFieldName,
                $options['index_opclass'] ?? 'vector_cosine_ops',
            ),
        );

        // Create GIN index for full-text search
        $this->connection->exec(
            \sprintf(
                'CREATE INDEX IF NOT EXISTS %s_content_tsv_idx ON %s USING gin(content_tsv)',
                $this->tableName,
                $this->tableName,
            ),
        );
    }

    /**
     * Drops the table (and therefore its indexes) if it exists.
     */
    public function drop(): void
    {
        $this->connection->exec(\sprintf('DROP TABLE IF EXISTS %s', $this->tableName));
    }

    /**
     * Inserts the given documents, updating metadata, content and vector when the id already exists.
     *
     * The content column is filled from the document metadata text, or an empty string when there is none.
     */
    public function add(VectorDocument ...$documents): void
    {
        if ([] === $documents) {
            // Nothing to write: avoid preparing a statement that would never be executed
            return;
        }

        $statement = $this->connection->prepare(
            \sprintf(
                'INSERT INTO %1$s (id, metadata, %2$s, %3$s)
                VALUES (:id, :metadata, :content, :vector)
                ON CONFLICT (id) DO UPDATE SET
                    metadata = EXCLUDED.metadata,
                    %2$s = EXCLUDED.%2$s,
                    %3$s = EXCLUDED.%3$s',
                $this->tableName,
                $this->contentFieldName,
                $this->vectorFieldName,
            ),
        );

        foreach ($documents as $document) {
            $statement->execute([
                'id' => $document->id->toRfc4122(),
                'metadata' => json_encode($document->metadata->getArrayCopy(), \JSON_THROW_ON_ERROR),
                'content' => $document->metadata->getText() ?? '',
                'vector' => $this->toPgvector($document->vector),
            ]);
        }
    }

    /**
     * Hybrid search combining vector similarity and full-text search.
     *
     * Strategy selection:
     *  - semanticRatio = 1.0, or no query text: pure vector search (score = distance, ascending)
     *  - semanticRatio = 0.0 with query text:   pure full-text search (score = 1 / (1 + ts_rank_cd), ascending)
     *  - anything else:                         RRF hybrid search (score = weighted reciprocal ranks, descending)
     *
     * @param array{
     *     q?: string,
     *     semanticRatio?: float,
     *     limit?: int,
     *     where?: string,
     *     params?: array,
     *     maxScore?: float
     * } $options "where" is a raw SQL condition whose placeholders are bound through "params";
     *            "limit" defaults to 5; "maxScore" falls back to the configured default
     *
     * @return VectorDocument[]
     *
     * @throws InvalidArgumentException When the effective semantic ratio is outside the [0.0, 1.0] range
     */
    public function query(Vector $vector, array $options = []): array
    {
        // Options are untyped: normalise to float so that an integer 0 or 1 still selects
        // the pure FTS / pure vector strategy through the strict comparisons below.
        $semanticRatio = (float) ($options['semanticRatio'] ?? $this->semanticRatio);
        self::assertValidSemanticRatio($semanticRatio);

        $queryText = (string) ($options['q'] ?? '');
        $limit = $options['limit'] ?? 5;

        // Use maxScore from options, or defaultMaxScore if configured
        $maxScore = $options['maxScore'] ?? $this->defaultMaxScore;

        // Build WHERE clause
        $conditions = [];
        $params = [];

        if (null !== $maxScore) {
            $conditions[] = "({$this->vectorFieldName} {$this->distance->getComparisonSign()} :embedding) <= :maxScore";
            $params['maxScore'] = $maxScore;
        }

        if ($options['where'] ?? false) {
            $conditions[] = '('.$options['where'].')';
        }

        $whereClause = $conditions ? 'WHERE '.implode(' AND ', $conditions) : '';

        // Choose query strategy based on semanticRatio and query text.
        // An explicit '' comparison is used instead of empty() so that the query "0" is still searched.
        if (1.0 === $semanticRatio || '' === $queryText) {
            // Pure vector search
            $sql = $this->buildVectorOnlyQuery($whereClause, $limit);
            $needsEmbedding = true;
        } elseif (0.0 === $semanticRatio) {
            // Pure full-text search; the embedding is only referenced by the maxScore filter
            $sql = $this->buildFtsOnlyQuery($whereClause, $limit);
            $params['query'] = $queryText;
            $needsEmbedding = null !== $maxScore;
        } else {
            // Hybrid search with weighted combination
            $sql = $this->buildHybridQuery($whereClause, $limit, $semanticRatio);
            $params['query'] = $queryText;
            $needsEmbedding = true;
        }

        // Bind :embedding exactly when the generated SQL references it, whatever the ratio is
        if ($needsEmbedding) {
            $params['embedding'] = $this->toPgvector($vector);
        }

        $statement = $this->connection->prepare($sql);
        $statement->execute([...$params, ...($options['params'] ?? [])]);

        $documents = [];
        foreach ($statement->fetchAll(\PDO::FETCH_ASSOC) as $row) {
            $documents[] = new VectorDocument(
                id: Uuid::fromString($row['id']),
                vector: new Vector($this->fromPgvector($row['embedding'])),
                metadata: new Metadata(json_decode($row['metadata'] ?? '{}', true, 512, \JSON_THROW_ON_ERROR)),
                // The driver may hand numeric columns back as strings
                score: (float) $row['score'],
            );
        }

        return $documents;
    }

    /**
     * Builds the pure vector search SQL: rows ordered by ascending distance to :embedding.
     */
    private function buildVectorOnlyQuery(string $whereClause, int $limit): string
    {
        return \sprintf(<<<SQL
            SELECT id, %s AS embedding, metadata, (%s %s :embedding) AS score
            FROM %s
            %s
            ORDER BY score ASC
            LIMIT %d
            SQL,
            $this->vectorFieldName,
            $this->vectorFieldName,
            $this->distance->getComparisonSign(),
            $this->tableName,
            $whereClause,
            $limit,
        );
    }

    /**
     * Builds the pure full-text search SQL.
     *
     * Only rows matching :query are returned; the rank is mapped to 1 / (1 + rank) so that,
     * like a distance, a lower score means a better match.
     */
    private function buildFtsOnlyQuery(string $whereClause, int $limit): string
    {
        return \sprintf(<<<SQL
            SELECT id, %s AS embedding, metadata,
                (1.0 / (1.0 + ts_rank_cd(content_tsv, websearch_to_tsquery('%s', :query)))) AS score
            FROM %s
            %s
            ORDER BY score ASC
            LIMIT %d
            SQL,
            $this->vectorFieldName,
            $this->language,
            $this->tableName,
            $this->withFtsFilter($whereClause),
            $limit,
        );
    }

    /**
     * Builds the hybrid search SQL using Reciprocal Rank Fusion (RRF).
     *
     * Each row's score is 1 / (k + vector rank) * semanticRatio + 1 / (k + FTS rank) * (1 - semanticRatio),
     * where a missing FTS rank contributes 0. Higher score is better, hence ORDER BY score DESC.
     */
    private function buildHybridQuery(string $whereClause, int $limit, float $semanticRatio): string
    {
        // %F (not %f) keeps the decimal point independent of the current locale,
        // otherwise a locale using a decimal comma would yield invalid SQL.
        return \sprintf(<<<SQL
            WITH vector_scores AS (
                SELECT id, %s AS embedding, metadata,
                    ROW_NUMBER() OVER (ORDER BY %s %s :embedding) AS rank_ix
                FROM %s
                %s
            ),
            fts_scores AS (
                SELECT id,
                    ROW_NUMBER() OVER (ORDER BY ts_rank_cd(content_tsv, websearch_to_tsquery('%s', :query)) DESC) AS rank_ix
                FROM %s
                %s
            )
            SELECT v.id, v.embedding, v.metadata,
                (
                    COALESCE(1.0 / (%d + v.rank_ix), 0.0) * %F +
                    COALESCE(1.0 / (%d + f.rank_ix), 0.0) * %F
                ) AS score
            FROM vector_scores v
            FULL OUTER JOIN fts_scores f ON v.id = f.id
            WHERE v.id IS NOT NULL OR f.id IS NOT NULL
            ORDER BY score DESC
            LIMIT %d
            SQL,
            $this->vectorFieldName,
            $this->vectorFieldName,
            $this->distance->getComparisonSign(),
            $this->tableName,
            $whereClause,
            $this->language,
            $this->tableName,
            $this->withFtsFilter($whereClause),
            $this->rrfK,
            $semanticRatio,
            $this->rrfK,
            1.0 - $semanticRatio,
            $limit,
        );
    }

    /**
     * Returns the given WHERE clause (possibly empty) with the full-text match condition prepended.
     */
    private function withFtsFilter(string $whereClause): string
    {
        $ftsFilter = \sprintf("content_tsv @@ websearch_to_tsquery('%s', :query)", $this->language);

        if ('' === $whereClause) {
            return 'WHERE '.$ftsFilter;
        }

        // Rewrite only the leading keyword: a caller-supplied condition may itself contain
        // "WHERE " (e.g. in a sub-query), which must be left untouched.
        return 'WHERE '.$ftsFilter.' AND '.substr($whereClause, \strlen('WHERE '));
    }

    /**
     * @throws InvalidArgumentException When the ratio is outside the [0.0, 1.0] range
     */
    private static function assertValidSemanticRatio(float $semanticRatio): void
    {
        if ($semanticRatio < 0.0 || $semanticRatio > 1.0) {
            throw new InvalidArgumentException(\sprintf('The semantic ratio must be between 0.0 and 1.0, "%s" given.', $semanticRatio));
        }
    }

    /**
     * Serialises a vector to the "[v1,v2,...]" text form bound to the vector placeholders.
     */
    private function toPgvector(VectorInterface $vector): string
    {
        return '['.implode(',', $vector->getData()).']';
    }

    /**
     * Parses the "[v1,v2,...]" text form of a vector, which is decoded as a JSON array.
     *
     * @return float[]
     */
    private function fromPgvector(string $vector): array
    {
        return json_decode($vector, true, 512, \JSON_THROW_ON_ERROR);
    }
}
0 commit comments