Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
aa94fe7
parquet init
alimcmaster1 Apr 18, 2020
a30c71a
Doc Str
alimcmaster1 Apr 18, 2020
b2747eb
Simplify read
alimcmaster1 Apr 18, 2020
a51757a
Fix writer with partition
alimcmaster1 Apr 18, 2020
968f3b6
Test case
alimcmaster1 Apr 18, 2020
789f4ca
Clean up test case
alimcmaster1 Apr 18, 2020
040763e
Add whatsnew
alimcmaster1 Apr 18, 2020
40f5889
Clean ups
alimcmaster1 Apr 18, 2020
753d647
Clean ups
alimcmaster1 Apr 18, 2020
e4dcdc3
Update whatsnew
alimcmaster1 Apr 18, 2020
bb21431
Add skip if no
alimcmaster1 Apr 18, 2020
fb38932
Fix import
alimcmaster1 Apr 18, 2020
c29befd
Removed fixed xfail
alimcmaster1 Apr 18, 2020
4f78fc5
remove import
alimcmaster1 Apr 18, 2020
4b2828b
Merge master
alimcmaster1 Apr 21, 2020
463c2ea
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 21, 2020
dabfe58
Add further test case
alimcmaster1 Apr 21, 2020
dea95f3
Update parquet.py
alimcmaster1 Apr 25, 2020
ae76e42
Update parquet.py
alimcmaster1 Apr 25, 2020
4b48326
Add whatsnew 2
alimcmaster1 Apr 26, 2020
211c36e
Rename var
alimcmaster1 Apr 26, 2020
4897a32
Improve get_fs_for_path docstring
alimcmaster1 Apr 26, 2020
bba4040
Merge remote-tracking branch 'origin/mcmali-parquet' into mcmali-parquet
alimcmaster1 Apr 26, 2020
5bc6327
Add doc example
alimcmaster1 Apr 26, 2020
ca89c21
Make whatsnew clearer
alimcmaster1 Apr 26, 2020
0df818e
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
2a1a85c
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
parquet init
  • Loading branch information
alimcmaster1 committed Apr 18, 2020
commit aa94fe70430e722b92128f1a23f7dbac93030cbb
12 changes: 11 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,16 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)


def get_fs_for_path(filepath: str):
    """
    Get the appropriate filesystem object for a given file path.

    Parameters
    ----------
    filepath : str
        File path, possibly an ``s3://`` or ``gcs://`` URL.

    Returns
    -------
    s3fs.S3FileSystem, gcsfs.GCSFileSystem or None
        Filesystem matching the URL scheme, or None for local paths.
        The return annotation is left off because s3fs/gcsfs are
        optional dependencies and may not be importable.
    """
    # Imports are deferred so the optional s3fs/gcsfs dependencies are
    # only required when a matching URL scheme is actually used.
    if is_s3_url(filepath):
        from pandas.io import s3

        return s3.get_fs()
    elif is_gcs_url(filepath):
        from pandas.io import gcs

        return gcs.get_fs()
    else:
        return None

def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
Expand Down Expand Up @@ -192,7 +202,7 @@ def get_filepath_or_buffer(
compression = "gzip"
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
return reader, encoding, compression, True, None

if is_s3_url(filepath_or_buffer):
from pandas.io import s3
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
)


def get_fs():
    """Return a gcsfs filesystem object for accessing Google Cloud Storage."""
    return gcsfs.GCSFileSystem()


def get_filepath_or_buffer(
    filepath_or_buffer, encoding=None, compression=None, mode=None
):
    """
    Open a GCS path and return a file-like handle.

    Returns
    -------
    tuple
        (open file object, encoding (always None), compression,
        should_close flag (always True)).
    """
    read_mode = "rb" if mode is None else mode
    handle = get_fs().open(filepath_or_buffer, read_mode)
    return handle, None, compression, True
15 changes: 8 additions & 7 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from pandas import DataFrame, get_option

from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url, get_fs_for_path


def get_engine(engine: str) -> "BaseImpl":
Expand Down Expand Up @@ -111,14 +111,15 @@ def write(
self.api.parquet.write_table(table, path, compression=compression, **kwargs)

def read(self, path, columns=None, **kwargs):
    """
    Read a parquet file or partitioned dataset into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Local path, buffer, or ``s3://`` / ``gcs://`` URL.
    columns : list, optional
        Subset of columns to read; None reads all columns.
    **kwargs
        Passed through to pyarrow's ``ParquetDataset`` constructor.

    Returns
    -------
    DataFrame
    """
    filepath, _, _, should_close = get_filepath_or_buffer(path)
    # ParquetDataset (rather than read_table) handles directory
    # partitioned datasets; the filesystem object is resolved from the
    # URL scheme so s3/gcs paths work.
    parquet_ds = self.api.parquet.ParquetDataset(
        path, filesystem=get_fs_for_path(path), **kwargs
    )
    # use_pandas_metadata is a read-time option; keep it out of kwargs
    # so user kwargs are not double-passed to both constructor and read.
    result = parquet_ds.read(
        columns=columns, use_pandas_metadata=True
    ).to_pandas()

    if should_close:
        filepath.close()

    return result

Expand Down
8 changes: 6 additions & 2 deletions pandas/io/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def _strip_schema(url):
return result.netloc + result.path


def get_fs(anon: bool = False):
    """
    Return an s3fs filesystem object.

    Parameters
    ----------
    anon : bool, default False
        Connect anonymously (no credentials). Callers that previously
        constructed ``s3fs.S3FileSystem(anon=True)`` as a fallback for
        public buckets should pass ``anon=True`` here; the hard-coded
        ``anon=False`` would otherwise lose that retry behavior.
    """
    return s3fs.S3FileSystem(anon=anon)


def get_file_and_filesystem(
filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None
) -> Tuple[IO, Any]:
Expand All @@ -24,7 +28,7 @@ def get_file_and_filesystem(
if mode is None:
mode = "rb"

fs = s3fs.S3FileSystem(anon=False)
fs = get_fs()
try:
file = fs.open(_strip_schema(filepath_or_buffer), mode)
except (FileNotFoundError, NoCredentialsError):
Expand All @@ -34,7 +38,7 @@ def get_file_and_filesystem(
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
fs = get_fs()
file = fs.open(_strip_schema(filepath_or_buffer), mode)
return file, fs

Expand Down