|
6 | 6 | import json
|
7 | 7 | import pandas
|
8 | 8 | import pathlib
|
9 |
| -from pinecone.grpc import PineconeGRPC |
10 | 9 | import pyarrow.dataset as ds
|
11 | 10 | from pyarrow.parquet import ParquetDataset, ParquetFile
|
12 | 11 |
|
|
16 | 15 | from vsb import logger
|
17 | 16 | from vsb.logging import ProgressIOWrapper
|
18 | 17 |
|
| 18 | +# remove |
| 19 | +import gevent.monkey |
| 20 | + |
19 | 21 |
|
20 | 22 | class Dataset:
|
21 | 23 | """
|
@@ -168,13 +170,30 @@ def setup_queries(self, query_limit=0):
|
168 | 170 | )
|
169 | 171 |
|
170 | 172 | def _download_dataset_files(self):
|
| 173 | + # Unpatch all gevent monkeypatched modules; we use Google Cloud |
| 174 | + # Python libraries which will try to call things like socket and |
| 175 | + # wait, and if those are monkeypatched, they'll fail with a LoopExit |
| 176 | + # because the OS thread they run in has no hub. |
| 177 | + |
| 178 | + # https://github.com/gevent/gevent/issues/1350#issuecomment-478630812 |
| 179 | + |
| 180 | + # Note that this does mean that this function will block in a non- |
| 181 | + # gevent-friendly way. Ensure that it's called in a threadpool, or |
| 182 | + # you may get heartbeat failures in distributed mode. |
| 183 | + import threading |
| 184 | + from importlib import reload |
| 185 | + |
| 186 | + reload(threading) |
171 | 187 | with FileLock(self.cache / ".lock"):
|
172 | 188 | self.cache.mkdir(parents=True, exist_ok=True)
|
173 | 189 | logger.debug(
|
174 | 190 | f"Checking for existence of dataset '{self.name}' in dataset cache '{self.cache}'"
|
175 | 191 | )
|
176 | 192 | client = Client.create_anonymous_client()
|
177 | 193 | bucket: Bucket = client.bucket(Dataset.gcs_bucket)
|
| 194 | + logger.debug( |
| 195 | + f"_download_dataset_files(): threading={gevent.monkey.is_module_patched('threading')}" |
| 196 | + ) |
178 | 197 | blobs = [b for b in bucket.list_blobs(prefix=self.name + "/")]
|
179 | 198 | # Ignore directories (blobs ending in '/') as we don't explicitly need them
|
180 | 199 | # (non-empty directories will have their files downloaded
|
@@ -230,6 +249,8 @@ def should_download(blob):
|
230 | 249 | # Clear the progress bar now we're done.
|
231 | 250 | vsb.progress.stop()
|
232 | 251 | vsb.progress = None
|
| 252 | + # Re-apply gevent monkeypatching. |
| 253 | + gevent.monkey.patch_all() |
233 | 254 |
|
234 | 255 | def _load_parquet_dataset(self, kind, limit=0):
|
235 | 256 | parquet_files = [f for f in (self.cache / self.name).glob(kind + "/*.parquet")]
|
|
0 commit comments