forked from vaexio/vaex
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1bb65dd
commit bad6103
Showing
18 changed files
with
408 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2015, Maarten A. Breddels | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include LICENSE.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
"""Packaging script for vaex_arrow (Arrow support for vaex)."""
import os

from setuptools import setup

dirname = os.path.dirname(__file__)
path_version = os.path.join(dirname, "vaex_arrow/_version.py")

# Read the version by executing _version.py in its own namespace, i.e. without
# importing the vaex_arrow package itself.  This replaces imp.load_source: the
# imp module is deprecated since Python 3.4 and was removed in Python 3.12.
version_namespace = {}
with open(path_version) as version_file:
    exec(compile(version_file.read(), path_version, "exec"), version_namespace)

name = 'vaex'
author = "Maarten A. Breddels"
author_email = "maartenbreddels@gmail.com"
license = 'MIT'
version = version_namespace['__version__']
url = 'https://www.github.com/maartenbreddels/vaex'
install_requires = ["vaex-core", "matplotlib>=1.3.1", "pillow", "pyarrow"]

setup(name=name + '_arrow',
      version=version,
      description='Arrow support for vaex',
      url=url,
      author=author,
      author_email=author_email,
      install_requires=install_requires,
      license=license,
      packages=['vaex_arrow'],
      zip_safe=False,
      # registers the opener with vaex through the 'vaex.plugin' entry point group
      entry_points={'vaex.plugin': ['reader = vaex_arrow.opener:register_opener']}
      )
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Version as a tuple of integers; keep in sync with __version__ below.
__version_tuple__ = (0, 0, 0)
# Version as a string; setup.py loads this file by path and reads __version__.
__version__ = '0.0.0'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import pyarrow | ||
import numpy as np | ||
from .dataset import DatasetArrow | ||
|
||
def arrow_array_from_numpy_array(array):
    """Convert a numpy array (optionally a masked array) to a pyarrow array.

    :param array: numpy ndarray or numpy.ma.MaskedArray
    :return: pyarrow array; elements masked in `array` are passed to pyarrow via its `mask` argument.
             Byte strings (dtype kind 'S') are stored with type ``pyarrow.binary(itemsize)``,
             arrays in non-native byte order are converted to native order first (this copies).
    """
    dtype = array.dtype
    mask = None
    if np.ma.isMaskedArray(array):
        mask = array.mask
        if mask is np.ma.nomask:
            # numpy uses a scalar sentinel when nothing is masked; pyarrow expects
            # either no mask or a boolean array, so treat it as "no mask"
            mask = None
        # Hand pyarrow the plain ndarray and the mask separately.
        # NOTE(review): recent pyarrow releases appear to raise a ValueError when given
        # both a masked array and an explicit mask -- confirm against the pyarrow docs.
        array = np.ma.getdata(array)
    if dtype.kind == 'S':
        arrow_type = pyarrow.binary(dtype.itemsize)
        return pyarrow.array(array, arrow_type, mask=mask)
    if not dtype.isnative:
        # TODO: we copy here, but I guess we should not... or give some warning
        array = array.astype(dtype.newbyteorder('='))
    return pyarrow.array(array, mask=mask)
|
||
|
||
def arrow_table_from_vaex_dataset(ds, column_names=None, selection=None, strings=True, virtual=False):
    """Implementation of Dataset.to_arrow_table

    Every (name, array) pair returned by ``ds.to_items`` is converted with
    arrow_array_from_numpy_array, and the results are combined into a pyarrow Table.
    """
    items = list(ds.to_items(column_names=column_names, selection=selection, strings=strings, virtual=virtual))
    column_labels = [label for label, _ in items]
    arrow_columns = [arrow_array_from_numpy_array(values) for _, values in items]
    return pyarrow.Table.from_arrays(arrow_columns, column_labels)
|
||
def vaex_dataset_from_arrow_table(table):
    """Wrap an arrow table in a DatasetArrow and return it."""
    dataset = DatasetArrow(table=table)
    return dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
__author__ = 'maartenbreddels' | ||
import logging | ||
|
||
import numpy as np | ||
import pyarrow as pa | ||
|
||
import vaex.dataset | ||
import vaex.file.other | ||
logger = logging.getLogger("vaex_arrow") | ||
|
||
|
||
class DatasetArrow(vaex.dataset.DatasetLocal):
    """Implements storage using arrow"""

    def __init__(self, filename=None, table=None, write=False):
        """
        :param str filename: path of an arrow stream file; it is read when `table` is None
        :param table: arrow table to expose as a dataset; when given, nothing is read from `filename`
        :param bool write: stored as ``self._write``; not used elsewhere in this class
        """
        super(DatasetArrow, self).__init__(name=filename, path=filename, column_names=[])
        self._write = write
        if table is None:
            self._load()
        else:
            self._load_table(table)

    def _load(self):
        """Memory map ``self.path``, read all record batches from the stream and load them as one table."""
        # The memory map is not closed here; presumably the numpy arrays created in
        # _load_table are views on its memory -- TODO confirm
        source = pa.memory_map(self.path)
        reader = pa.open_stream(source)
        table = pa.Table.from_batches(list(reader))
        self._load_table(table)

    def _load_table(self, table):
        """Expose every column of `table` as a numpy array, or a masked array when a validity bitmap is present.

        :raises NotImplementedError: when a column does not consist of exactly one chunk
        :raises ValueError: when a column does not have exactly two buffers (bitmap and data),
            or when a binary data buffer is smaller than expected
        """
        self._length_unfiltered = self._length_original = table.num_rows
        for col in table.columns:
            name = col.name
            # TODO: keep the arrow columns, and support and test chunks
            chunks = col.data.chunks
            if len(chunks) != 1:
                # the dataset length is table.num_rows, so using only the first chunk
                # would silently expose a column of the wrong length
                raise NotImplementedError('column %r has %d chunks, only single chunk columns are supported' % (name, len(chunks)))
            arrow_array = chunks[0]
            arrow_type = arrow_array.type
            buffers = arrow_array.buffers()
            # explicit check instead of assert, which is stripped when running with -O
            if len(buffers) != 2:
                raise ValueError('column %r has %d buffers, expected 2 (bitmap and data)' % (name, len(buffers)))
            bitmap_buffer, data_buffer = buffers
            mask = None
            if bitmap_buffer is not None:
                # arrow uses a bitmap https://github.com/apache/arrow/blob/master/format/Layout.md
                bitmap = np.frombuffer(bitmap_buffer, np.uint8, len(bitmap_buffer))
                # we do have to change the ordering of the bits
                # (1 - bit inverts the values: a set bit in the bitmap becomes 0 in the numpy mask)
                mask = 1-np.unpackbits(bitmap).reshape((len(bitmap),8))[:,::-1].reshape(-1)[:len(arrow_array)]
            if isinstance(arrow_type, type(pa.binary(1))):
                # mimics python/pyarrow/array.pxi::Array::to_numpy
                dtype = "S" + str(arrow_type.byte_width)
                # arrow seems to do padding, check if it is all ok
                expected_length = arrow_type.byte_width * len(arrow_array)
                actual_length = len(data_buffer)
                if actual_length < expected_length:
                    raise ValueError('buffer is smaller (%d) than expected (%d)' % (actual_length, expected_length))
                # TODO: deal with offset ? [arrow_array.offset:arrow_array.offset + len(arrow_array)]
                array = np.frombuffer(data_buffer, dtype, len(arrow_array))
            else:
                dtype = arrow_array.type.to_pandas_dtype()
                array = np.frombuffer(data_buffer, dtype, len(arrow_array))
            if mask is not None:
                array = np.ma.MaskedArray(array, mask=mask)
            self.columns[name] = array
            self.column_names.append(name)
            self._save_assign_expression(name, vaex.expression.Expression(self, name))

    @classmethod
    def can_open(cls, path, *args, **kwargs):
        """Return True when the text after the last '.' in `path` is 'arrow'."""
        return path.rpartition('.')[2] == 'arrow'

    @classmethod
    def get_options(cls, path):
        """Return an empty list."""
        return []

    @classmethod
    def option_to_args(cls, option):
        """Return an empty list."""
        return []


# make the class available under the "arrow" key of vaex.file.other.dataset_type_map
vaex.file.other.dataset_type_map["arrow"] = DatasetArrow
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
__author__ = 'maartenbreddels' | ||
import os | ||
import sys | ||
import warnings | ||
import collections | ||
import logging | ||
|
||
import numpy as np | ||
import vaex | ||
|
||
from .convert import arrow_array_from_numpy_array | ||
|
||
max_length = int(1e5)

# A failing pyarrow import is only tolerated when the READTHEDOCS environment
# variable is 'True'; everywhere else the error is re-raised.
on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
try:
    import pyarrow as pa
except ImportError:
    # only catch the import failure itself: a bare except would also swallow
    # KeyboardInterrupt/SystemExit and unrelated errors
    if not on_rtd:
        raise

logger = logging.getLogger("vaex_arrow.export")
|
||
def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
    """Export a dataset to a file in the arrow stream format.

    :param DatasetLocal dataset: dataset to export
    :param str path: path for file
    :param list[str] column_names: list of column names to export or None for all columns
    :param str byteorder: = for native, < for little endian and > for big endian (accepted but not used by this exporter)
    :param bool shuffle: export rows in random order; the order used is exported as an extra column
        named "random_index" (with "_new" appended until the name is unused)
    :param bool selection: export selection or not
    :param progress: progress callback that gets a progress fraction as argument and should return True to continue,
        or a default progress bar when progress=True (accepted but not used by this exporter)
    :param bool virtual: When True, export virtual columns
    :param str sort: expression to sort the rows by, or None for no sorting
    :param bool ascending: sort ascending (True) or descending (False)
    :raises ValueError: when there are no rows to export, when shuffle and sort are combined,
        or when sort is combined with a selection
    :return:
    """
    column_names = column_names or dataset.get_column_names(virtual=virtual, strings=True)
    for name in column_names:
        if name not in dataset.columns:
            warnings.warn('Exporting to arrow with virtual columns is not efficient')
    N = len(dataset) if not selection else dataset.selected_length(selection)
    if N == 0:
        raise ValueError("Cannot export empty table")

    if shuffle and sort:
        raise ValueError("Cannot shuffle and sort at the same time")

    if shuffle:
        random_index_column = "random_index"
        while random_index_column in dataset.get_column_names():
            random_index_column += "_new"
    partial_shuffle = shuffle and len(dataset) != N

    order_array = None
    if partial_shuffle:
        # if we only export a portion, we need to create the full length random_index array, and
        shuffle_array_full = np.random.choice(len(dataset), len(dataset), replace=False)
        # then take a section of it
        order_array = shuffle_array_full[shuffle_array_full < N]
        del shuffle_array_full
    elif shuffle:
        order_array = np.random.choice(N, N, replace=False)

    if sort:
        if selection:
            raise ValueError("sorting selections not yet supported")
        logger.info("sorting...")
        indices = np.argsort(dataset.evaluate(sort))
        order_array = indices if ascending else indices[::-1]
        logger.info("sorting done")

    full_mask = dataset.evaluate_selection_mask(selection) if selection else None

    arrow_arrays = []
    for column_name in column_names:
        if selection:
            values = dataset.evaluate(column_name, filtered=False)[full_mask]
        else:
            values = dataset.evaluate(column_name)
        if shuffle or sort:
            values = values[order_array]
        arrow_arrays.append(arrow_array_from_numpy_array(values))
    if shuffle:
        arrow_arrays.append(arrow_array_from_numpy_array(order_array))
        # list(...) so a tuple of column names works as well
        column_names = list(column_names) + [random_index_column]
    table = pa.Table.from_arrays(arrow_arrays, column_names)
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, table.schema)
        try:
            writer.write_table(table)
        finally:
            # closing the writer finishes the stream; the original never closed it
            writer.close()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import vaex.file | ||
|
||
class ArrowOpener:
    """Opener that recognises paths ending in 'arrow' and opens them as DatasetArrow."""

    @staticmethod
    def can_open(path, *args, **kwargs):
        """Return True when the text after the last '.' in `path` is 'arrow'.

        A path without any '.' is compared as a whole, so the bare name 'arrow' also matches.
        """
        extension = path.rsplit('.', 1)[-1]
        return extension == 'arrow'

    @staticmethod
    def open(path, *args, **kwargs):
        """Create a DatasetArrow for `path`, forwarding any extra arguments."""
        # imported here rather than at module level, so .dataset is only loaded when a file is opened
        from .dataset import DatasetArrow
        dataset = DatasetArrow(path, *args, **kwargs)
        return dataset
|
||
def register_opener():
    """Register ArrowOpener with vaex.file."""
    opener = ArrowOpener
    vaex.file.register(opener)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.