Skip to content

Commit b4d1f08

Browse files
committed
Add document processing pipeline to Indexer class
- Add optional loader and transformers to the Indexer constructor
- Add __invoke method to process sources through the full pipeline
- Update the example to use the enhanced Indexer directly instead of DocumentProcessor
- Maintain backward compatibility with the existing index() method
1 parent 242501e commit b4d1f08

File tree

2 files changed

+54
-5
lines changed

2 files changed

+54
-5
lines changed

examples/indexer/movies.php

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
use Symfony\AI\Platform\Bridge\OpenAi\Embeddings;
1313
use Symfony\AI\Platform\Bridge\OpenAi\PlatformFactory;
1414
use Symfony\AI\Store\Bridge\Local\InMemoryStore;
15-
use Symfony\AI\Store\Document\DocumentProcessor;
1615
use Symfony\AI\Store\Document\Loader\TextFileLoader;
1716
use Symfony\AI\Store\Document\Transformer\ReplaceTextTransformer;
1817
use Symfony\AI\Store\Document\Transformer\TextSplitTransformer;
@@ -23,13 +22,14 @@
2322

2423
$platform = PlatformFactory::create(env('OPENAI_API_KEY'), http_client());
2524
$store = new InMemoryStore();
26-
$processor = new DocumentProcessor(
25+
$indexer = new Indexer(
26+
new Vectorizer($platform, new Embeddings('text-embedding-3-small')),
27+
$store,
2728
new TextFileLoader(),
2829
[
2930
new ReplaceTextTransformer(search: '## Plot', replace: '## Synopsis'),
3031
new TextSplitTransformer(chunkSize: 500, overlap: 100),
31-
],
32-
new Indexer(new Vectorizer($platform, new Embeddings('text-embedding-3-small')), $store)
32+
]
3333
);
3434

3535
$movies = [
@@ -38,7 +38,7 @@
3838
dirname(__DIR__, 2).'/fixtures/movies/jurassic-park.md',
3939
];
4040

41-
$processor->process($movies);
41+
$indexer($movies);
4242

4343
$vector = $platform->invoke(new Embeddings('text-embedding-3-small'), 'Roman gladiator revenge')->asVectors()[0];
4444
$results = $store->search($vector, 2);

src/store/src/Indexer.php

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,24 @@
1313

1414
use Psr\Log\LoggerInterface;
1515
use Psr\Log\NullLogger;
16+
use Symfony\AI\Store\Document\LoaderInterface;
1617
use Symfony\AI\Store\Document\TextDocument;
18+
use Symfony\AI\Store\Document\TransformerInterface;
1719
use Symfony\AI\Store\Document\VectorizerInterface;
1820

1921
/**
2022
* @author Christopher Hertel <mail@christopher-hertel.de>
2123
*/
2224
final readonly class Indexer implements IndexerInterface
2325
{
26+
/**
 * Wires up the collaborators used by this indexer.
 *
 * In the code visible here, the loader and transformers are used by __invoke();
 * when no loader is configured, __invoke() throws a \LogicException.
 *
 * NOTE(review): $loader and $transformers are declared ahead of $logger, so a caller
 * that passes a logger as the third positional argument no longer matches the
 * signature and has to switch to the named argument `logger:` — confirm no such
 * call sites remain.
 *
 * @param VectorizerInterface    $vectorizer   Vectorizer dependency (presumably used by index() to embed documents — TODO confirm, its body is not fully visible here)
 * @param StoreInterface         $store        Store dependency (presumably the target index() writes to — TODO confirm)
 * @param LoaderInterface|null   $loader       Optional loader invoked once per source by __invoke(); null disables __invoke()
 * @param TransformerInterface[] $transformers Applied in array order by __invoke(), each one receiving the previous one's output
 * @param LoggerInterface        $logger       Receives debug messages; defaults to a NullLogger
 */
public function __construct(
    private VectorizerInterface $vectorizer,
    private StoreInterface $store,
    private ?LoaderInterface $loader = null,
    private array $transformers = [],
    private LoggerInterface $logger = new NullLogger(),
) {
}
@@ -56,4 +63,46 @@ public function index(TextDocument|iterable $documents, int $chunkSize = 50): vo
5663

5764
$this->logger->debug(0 === $counter ? 'No documents to index' : \sprintf('Indexed %d documents', $counter));
5865
}
66+
67+
/**
 * Runs one or more sources through the whole pipeline: load, transform, then index().
 *
 * Every source is handed to the configured loader, the loaded documents are gathered
 * into a single list, that list is piped through each transformer in the order they
 * were configured, and the result is passed to index().
 *
 * Recognised option keys:
 *  - "loader":      options forwarded to the loader for every source (default: [])
 *  - "transformer": options forwarded to every transformer (default: [])
 *  - "chunk_size":  forwarded as the second argument of index() (default: 50)
 *
 * @param string|array<string> $source  Source identifier (file path, URL, etc.) or array of sources
 * @param array<string, mixed> $options Processing options
 *
 * @throws \LogicException When no loader was configured in the constructor
 */
public function __invoke(string|array $source, array $options = []): void
{
    // Bail out before logging anything when the pipeline has no entry point.
    $loader = $this->loader ?? throw new \LogicException('Cannot process sources without a loader. Either provide documents directly to index() or configure a loader in the constructor.');

    $this->logger->debug('Starting document processing', ['source' => $source, 'options' => $options]);

    $loaderOptions = $options['loader'] ?? [];
    $transformerOptions = $options['transformer'] ?? [];

    // Gather the documents of every source into one flat, re-indexed list.
    $documents = [];
    foreach (\is_array($source) ? $source : [$source] as $entry) {
        foreach ($loader($entry, $loaderOptions) as $loaded) {
            $documents[] = $loaded;
        }
    }

    // Each transformer consumes the output of the previous one.
    foreach ($this->transformers as $transform) {
        $documents = $transform($documents, $transformerOptions);
    }

    // Vectorizing and storing is delegated to index().
    $this->index($documents, $options['chunk_size'] ?? 50);

    $this->logger->debug('Document processing completed', ['source' => $source]);
}
59108
}

0 commit comments

Comments
 (0)