Skip to content

Commit 62ee10f

Browse files
committed
feat: add spacy model download and some logs!
1 parent af15793 commit 62ee10f

File tree

2 files changed

+4
-0
lines changed

2 files changed

+4
-0
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@ FROM python:3.11-bullseye AS base
2  2    WORKDIR /project
3  3    COPY . .
4  4    RUN pip install --no-cache-dir -r requirements.txt
   5  + RUN python -m spacy download en_core_web_sm
5  6
6  7    FROM base AS test
7  8    RUN chmod +x docker-entrypoint.sh

hivemind_etl/mediawiki/etl.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,10 +94,13 @@ def transform(self) -> list[Document]:
 94  94           return documents
 95  95
 96  96       def load(self, documents: list[Document]) -> None:
     97 +         logging.info(f"Loading {len(documents)} documents into Qdrant!")
 97  98           ingestion_pipeline = CustomIngestionPipeline(
 98  99               self.community_id, collection_name="mediawiki"
 99 100           )
100 101           ingestion_pipeline.run_pipeline(documents)
    102 +         logging.info(f"Loaded {len(documents)} documents into Qdrant!")
101 103
102 104           if self.delete_dump_after_load:
    105 +             logging.info(f"Removing dump directory {self.dump_dir}!")
103 106               shutil.rmtree(self.dump_dir)

0 commit comments

Comments
 (0)