Skip to content

Commit e3b50c0

Browse files
committed
Unpatch threading for dataset downloading
1 parent 96c1fe1 commit e3b50c0

File tree

1 file changed

+22
-1
lines changed

1 file changed

+22
-1
lines changed

vsb/workloads/dataset.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import json
77
import pandas
88
import pathlib
9-
from pinecone.grpc import PineconeGRPC
109
import pyarrow.dataset as ds
1110
from pyarrow.parquet import ParquetDataset, ParquetFile
1211

@@ -16,6 +15,9 @@
1615
from vsb import logger
1716
from vsb.logging import ProgressIOWrapper
1817

18+
# remove
19+
import gevent.monkey
20+
1921

2022
class Dataset:
2123
"""
@@ -168,13 +170,30 @@ def setup_queries(self, query_limit=0):
168170
)
169171

170172
def _download_dataset_files(self):
173+
# Unpatch all gevent monkeypatched modules; we use google cloud
174+
# python libraries which will try to call stuff like socket and
175+
# wait, and if they're monkeypatched, they'll fail with a LoopExit
176+
# because the OS thread they run in has no hub.
177+
178+
# https://github.com/gevent/gevent/issues/1350#issuecomment-478630812
179+
180+
# Note that this does mean that this function will block in a non-
181+
# gevent-friendly way. Ensure that it's called in a threadpool, or
182+
# you may get heartbeat failures in distributed mode.
183+
import threading
184+
from importlib import reload
185+
186+
reload(threading)
171187
with FileLock(self.cache / ".lock"):
172188
self.cache.mkdir(parents=True, exist_ok=True)
173189
logger.debug(
174190
f"Checking for existence of dataset '{self.name}' in dataset cache '{self.cache}'"
175191
)
176192
client = Client.create_anonymous_client()
177193
bucket: Bucket = client.bucket(Dataset.gcs_bucket)
194+
logger.debug(
195+
f"_download_dataset_files(): threading={gevent.monkey.is_module_patched('threading')}"
196+
)
178197
blobs = [b for b in bucket.list_blobs(prefix=self.name + "/")]
179198
# Ignore directories (blobs ending in '/') as we don't explicitly need them
180199
# (non-empty directories will have their files downloaded
@@ -230,6 +249,8 @@ def should_download(blob):
230249
# Clear the progress bar now we're done.
231250
vsb.progress.stop()
232251
vsb.progress = None
252+
# Re-apply gevent monkeypatching.
253+
gevent.monkey.patch_all()
233254

234255
def _load_parquet_dataset(self, kind, limit=0):
235256
parquet_files = [f for f in (self.cache / self.name).glob(kind + "/*.parquet")]

0 commit comments

Comments
 (0)