Skip to content

Commit 3b430da

Browse files
committed
feat: Added proxy support for mediaWiki!
1 parent ca0e28c commit 3b430da

File tree

4 files changed

+17
-2
lines changed

4 files changed

+17
-2
lines changed

.env.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,5 @@ POSTGRES_DB_PORT=
 POSTGRES_PWD=
 POSTGRES_SEEDS=
 POSTGRES_USER=
+
+MEDIAWIKI_PROXY_URL=

hivemind_etl/mediawiki/etl.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
 import logging
+import os
 import shutil
 
 from llama_index.core import Document
@@ -15,7 +16,14 @@ def __init__(
         delete_dump_after_load: bool = True,
     ) -> None:
         self.community_id = community_id
-        self.wikiteam_crawler = WikiteamCrawler(community_id, namespaces=namespaces)
+
+        self.proxy_url = os.getenv("MEDIAWIKI_PROXY_URL", "")
+        if self.proxy_url:
+            logging.info(f"Proxy is set to be used!")
+
+        self.wikiteam_crawler = WikiteamCrawler(
+            community_id, namespaces=namespaces, proxy_url=self.proxy_url
+        )
 
         self.dump_dir = f"dump_{self.community_id}"
         self.delete_dump_after_load = delete_dump_after_load

hivemind_etl/mediawiki/wikiteam_crawler.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def __init__(
         force: bool = True,
         curonly: bool = True,
         namespaces: list[int] = [],
+        proxy_url: str = "",
         **kwargs,
     ) -> None:
         self.community_id = community_id
@@ -20,6 +21,7 @@ def __init__(
         self.curonly = curonly
         self.extra_params = kwargs
         self.namespaces = namespaces
+        self.proxy_url = proxy_url
 
     def crawl(self, api_url: str, dump_path: str) -> None:
         """
@@ -50,6 +52,9 @@ def crawl(self, api_url: str, dump_path: str) -> None:
         if self.namespaces:
             params.append(f"--namespaces")
             params.append(f"{','.join(map(str, self.namespaces))}")
+        if self.proxy_url:
+            params.append(f"--proxy")
+            params.append(self.proxy_url)
 
         # Add any extra parameters passed during initialization
         for key, value in self.extra_params.items():

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ defusedxml==0.7.1
 pydantic==2.9.2
 motor>=3.6, <4.0.0
 tc-temporal-backend==1.0.0
-wikiteam3==4.4.1
+wikiteam3-fork-proxy==1.0.0

0 commit comments

Comments
 (0)