Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
aa94fe7
parquet init
alimcmaster1 Apr 18, 2020
a30c71a
Doc Str
alimcmaster1 Apr 18, 2020
b2747eb
Simplify read
alimcmaster1 Apr 18, 2020
a51757a
Fix writer with partition
alimcmaster1 Apr 18, 2020
968f3b6
Test case
alimcmaster1 Apr 18, 2020
789f4ca
Clean up test case
alimcmaster1 Apr 18, 2020
040763e
Add whatsnew
alimcmaster1 Apr 18, 2020
40f5889
Clean ups
alimcmaster1 Apr 18, 2020
753d647
Clean ups
alimcmaster1 Apr 18, 2020
e4dcdc3
Update whatsnew
alimcmaster1 Apr 18, 2020
bb21431
Add skip if no
alimcmaster1 Apr 18, 2020
fb38932
Fix import
alimcmaster1 Apr 18, 2020
c29befd
Removed fixed xfail
alimcmaster1 Apr 18, 2020
4f78fc5
remove import
alimcmaster1 Apr 18, 2020
4b2828b
Merge master
alimcmaster1 Apr 21, 2020
463c2ea
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 21, 2020
dabfe58
Add further test case
alimcmaster1 Apr 21, 2020
dea95f3
Update parquet.py
alimcmaster1 Apr 25, 2020
ae76e42
Update parquet.py
alimcmaster1 Apr 25, 2020
4b48326
Add whatsnew 2
alimcmaster1 Apr 26, 2020
211c36e
Rename var
alimcmaster1 Apr 26, 2020
4897a32
Improve get_fs_for_path docstring
alimcmaster1 Apr 26, 2020
bba4040
Merge remote-tracking branch 'origin/mcmali-parquet' into mcmali-parquet
alimcmaster1 Apr 26, 2020
5bc6327
Add doc example
alimcmaster1 Apr 26, 2020
ca89c21
Make whatsnew clearer
alimcmaster1 Apr 26, 2020
0df818e
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
2a1a85c
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
parquet init
  • Loading branch information
alimcmaster1 committed Apr 18, 2020
commit aa94fe70430e722b92128f1a23f7dbac93030cbb
12 changes: 11 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,16 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)


def get_fs_for_path(filepath: str):
    """
    Get the appropriate filesystem object for a given file path.

    Parameters
    ----------
    filepath : str
        File path, possibly an ``s3://`` or ``gcs://`` URL.

    Returns
    -------
    s3fs.S3FileSystem, gcsfs.GCSFileSystem or None
        Filesystem matching the URL scheme, or None for local paths.
        The return annotation is left off because s3fs/gcsfs are
        optional dependencies and may not be importable.
    """
    # Imports are deferred so the optional s3fs/gcsfs dependencies are
    # only required when a matching URL scheme is actually used.
    if is_s3_url(filepath):
        from pandas.io import s3

        return s3.get_fs()
    elif is_gcs_url(filepath):
        from pandas.io import gcs

        return gcs.get_fs()
    else:
        return None

def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
Expand Down Expand Up @@ -192,7 +202,7 @@ def get_filepath_or_buffer(
compression = "gzip"
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
return reader, encoding, compression, True, None

if is_s3_url(filepath_or_buffer):
from pandas.io import s3
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/gcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
)


def get_fs():
    """Return a gcsfs filesystem object for accessing Google Cloud Storage."""
    return gcsfs.GCSFileSystem()


def get_filepath_or_buffer(
    filepath_or_buffer, encoding=None, compression=None, mode=None
):
    """
    Open a GCS path and return a file-like handle.

    Returns
    -------
    tuple
        (open file object, encoding (always None), compression,
        should_close flag (always True)).
    """
    read_mode = "rb" if mode is None else mode
    handle = get_fs().open(filepath_or_buffer, read_mode)
    return handle, None, compression, True
15 changes: 8 additions & 7 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from pandas import DataFrame, get_option

from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url, get_fs_for_path


def get_engine(engine: str) -> "BaseImpl":
Expand Down Expand Up @@ -111,14 +111,15 @@ def write(
self.api.parquet.write_table(table, path, compression=compression, **kwargs)

def read(self, path, columns=None, **kwargs):
    """
    Read a parquet file or partitioned dataset into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Local path, buffer, or ``s3://`` / ``gcs://`` URL.
    columns : list, optional
        Subset of columns to read; None reads all columns.
    **kwargs
        Passed through to pyarrow's ``ParquetDataset`` constructor.

    Returns
    -------
    DataFrame
    """
    filepath, _, _, should_close = get_filepath_or_buffer(path)
    # ParquetDataset (rather than read_table) handles directory
    # partitioned datasets; the filesystem object is resolved from the
    # URL scheme so s3/gcs paths work.
    parquet_ds = self.api.parquet.ParquetDataset(
        path, filesystem=get_fs_for_path(path), **kwargs
    )
    # use_pandas_metadata is a read-time option; keep it out of kwargs
    # so user kwargs are not double-passed to both constructor and read.
    result = parquet_ds.read(
        columns=columns, use_pandas_metadata=True
    ).to_pandas()

    if should_close:
        filepath.close()

    return result

Expand Down
8 changes: 6 additions & 2 deletions pandas/io/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def _strip_schema(url):
return result.netloc + result.path


def get_fs(anon: bool = False):
    """
    Return an s3fs filesystem object.

    Parameters
    ----------
    anon : bool, default False
        Connect anonymously (no credentials). Callers that previously
        constructed ``s3fs.S3FileSystem(anon=True)`` as a fallback for
        public buckets should pass ``anon=True`` here; the hard-coded
        ``anon=False`` would otherwise lose that retry behavior.
    """
    return s3fs.S3FileSystem(anon=anon)


def get_file_and_filesystem(
filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None
) -> Tuple[IO, Any]:
Expand All @@ -24,7 +28,7 @@ def get_file_and_filesystem(
if mode is None:
mode = "rb"

fs = s3fs.S3FileSystem(anon=False)
fs = get_fs()
try:
file = fs.open(_strip_schema(filepath_or_buffer), mode)
except (FileNotFoundError, NoCredentialsError):
Expand All @@ -34,7 +38,7 @@ def get_file_and_filesystem(
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
fs = get_fs()
file = fs.open(_strip_schema(filepath_or_buffer), mode)
return file, fs

Expand Down