added arrow support

fcrimins · Nov 1, 2018 · bad6103 · bad6103
1 parent 1bb65dd
commit bad6103
Show file tree

Hide file tree

Showing 18 changed files with 408 additions and 8 deletions.
diff --git a/.releash.py b/.releash.py
@@ -15,7 +15,7 @@
 #core.release_targets.append(gitpush)
 core.release_targets.append(ReleaseTargetCondaForge(core, '../feedstocks/vaex-core-feedstock'))
 
-packages = ['vaex-core', 'vaex-meta', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed']
+packages = ['vaex-core', 'vaex-meta', 'vaex-viz', 'vaex-hdf5', 'vaex-server', 'vaex-astro', 'vaex-ui', 'vaex-jupyter', 'vaex-distributed', 'vaex-arrow']
 names = [k[5:] for k in packages[1:]]
 
 for name in names:

diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -8,6 +8,7 @@ Quick list for opening/reading in your data.
 .. autosummary::
 
     vaex.open
+    vaex.from_arrow_table
     vaex.from_arrays
     vaex.from_csv
     vaex.from_ascii
@@ -64,6 +65,14 @@ Dataset class
      :special-members:
 
 
+DatasetLocal class
+------------------
+
+.. autoclass:: vaex.dataset.DatasetLocal
+     :members:
+     :special-members:
+
+
 vaex.stat module
 ----------------
 

diff --git a/docs/source/datasets.ipynb b/docs/source/datasets.ipynb
@@ -206,7 +206,20 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.3"
+   "version": "3.6.6"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": false,
+   "sideBar": false,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": false,
+   "toc_window_display": false
   }
  },
  "nbformat": 4,

diff --git a/docs/source/index.ipynb b/docs/source/index.ipynb
@@ -67,6 +67,7 @@
     " * **Lean:** separated into multiple packages\n",
     "    * `vaex-core`: Dataset and core algorithms, takes numpy arrays as input columns.\n",
     "    * `vaex-hdf5`: Provides memory mapped numpy arrays to a Dataset.\n",
+    "    * `vaex-arrow`: [Arrow](https://arrow.apache.org/) support for cross language data sharing.\n",
     "    * `vaex-viz`: Visualization based on matplotlib.\n",
     "    * `vaex-jupyter`: Interactive visualization based on Jupyter widgets / ipywidgets, bqplot, ipyvolume and ipyleaflet.\n",
     "    * `vaex-astro`: Astronomy related transformations and FITS file support.\n",

diff --git a/docs/source/tutorial.ipynb b/docs/source/tutorial.ipynb
@@ -633,12 +633,30 @@
    "source": [
     "Other quick ways to get your data in are:\n",
     " \n",
+    "  * [from_arrow_table](api.rst#vaex.from_arrow_table): [Arrow](https://arrow.apache.org/) table support.\n",
     "  * [from_csv](api.rst#vaex.from_csv): Comma separated files\n",
     "  * [from_ascii](api.rst#vaex.from_ascii): Space/tab separated files\n",
     "  * [from_pandas](api.rst#vaex.from_pandas): Converts a pandas DataFrame\n",
-    "  * [from_astropy_table](api.rst#vaex.from_astropy_table): Converts a astropy table"
+    "  * [from_astropy_table](api.rst#vaex.from_astropy_table): Converts an astropy table\n",
+    "\n",
+    "Exporting, or converting a dataset to a different data structure, is also quite easy:\n",
+    " \n",
+    " * [Dataset.to_arrow_table](api.rst#vaex.dataset.Dataset.to_arrow_table)\n",
+    " * [Dataset.to_pandas_df](api.rst#vaex.dataset.Dataset.to_pandas_df)\n",
+    " * [Dataset.export](api.rst#vaex.dataset.DatasetLocal.export)\n",
+    " * [Dataset.export_hdf5](api.rst#vaex.dataset.DatasetLocal.export_hdf5)\n",
+    " * [Dataset.export_arrow](api.rst#vaex.dataset.DatasetLocal.export_arrow)\n",
+    " * [Dataset.export_fits](api.rst#vaex.dataset.DatasetLocal.export_fits)\n",
+    " "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/packages/vaex-arrow/LICENSE.txt b/packages/vaex-arrow/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015, Maarten A. Breddels
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/packages/vaex-arrow/MANIFEST.in b/packages/vaex-arrow/MANIFEST.in
@@ -0,0 +1 @@
+include LICENSE.txt
diff --git a/packages/vaex-arrow/setup.py b/packages/vaex-arrow/setup.py
@@ -0,0 +1,29 @@
+import os
+import imp
+from setuptools import setup
+
+dirname = os.path.dirname(__file__)
+path_version = os.path.join(dirname, "vaex_arrow/_version.py")
+version = imp.load_source('version', path_version)
+
+
+name = 'vaex'
+author = "Maarten A. Breddels"
+author_email = "maartenbreddels@gmail.com"
+license = 'MIT'
+version = version.__version__
+url = 'https://www.github.com/maartenbreddels/vaex'
+install_requires = ["vaex-core", "matplotlib>=1.3.1", "pillow", "pyarrow"]
+
+setup(name=name + '_arrow',
+      version=version,
+      description='Arrow support for vaex',
+      url=url,
+      author=author,
+      author_email=author_email,
+      install_requires=install_requires,
+      license=license,
+      packages=['vaex_arrow'],
+      zip_safe=False,
+      entry_points={'vaex.plugin': ['reader = vaex_arrow.opener:register_opener']}
+      )
diff --git a/packages/vaex-arrow/vaex_arrow/__init__.py b/packages/vaex-arrow/vaex_arrow/__init__.py
diff --git a/packages/vaex-arrow/vaex_arrow/_version.py b/packages/vaex-arrow/vaex_arrow/_version.py
@@ -0,0 +1,2 @@
+__version_tuple__ = (0, 0, 0)
+__version__ = '0.0.0'
diff --git a/packages/vaex-arrow/vaex_arrow/convert.py b/packages/vaex-arrow/vaex_arrow/convert.py
@@ -0,0 +1,34 @@
+import pyarrow
+import numpy as np
+from .dataset import DatasetArrow
+
+def arrow_array_from_numpy_array(array):
+    dtype = array.dtype
+    mask = None
+    if np.ma.isMaskedArray(array):
+        mask = array.mask
+    if dtype.kind == 'S':
+        type = pyarrow.binary(dtype.itemsize)
+        arrow_array = pyarrow.array(array, type, mask=mask)
+    else:
+        if dtype.isnative:
+            arrow_array = pyarrow.array(array, mask=mask)
+        else:
+            # TODO: we copy here, but I guess we should not... or give some warning
+            arrow_array = pyarrow.array(array.astype(dtype.newbyteorder('=')), mask=mask)
+    return arrow_array
+
+
+def arrow_table_from_vaex_dataset(ds, column_names=None, selection=None, strings=True, virtual=False):
+    """Implementation of Dataset.to_arrow_table"""
+    names = []
+    arrays = []
+    for name, array in ds.to_items(column_names=column_names, selection=selection, strings=strings, virtual=virtual):
+        names.append(name)
+        arrays.append(arrow_array_from_numpy_array(array))
+    # import IPython
+    # IPython.embed()
+    return pyarrow.Table.from_arrays(arrays, names)
+
+def vaex_dataset_from_arrow_table(table):
+    return DatasetArrow(table=table)
diff --git a/packages/vaex-arrow/vaex_arrow/dataset.py b/packages/vaex-arrow/vaex_arrow/dataset.py
@@ -0,0 +1,81 @@
+__author__ = 'maartenbreddels'
+import logging
+
+import numpy as np
+import pyarrow as pa
+
+import vaex.dataset
+import vaex.file.other
+logger = logging.getLogger("vaex_arrow")
+
+
+class DatasetArrow(vaex.dataset.DatasetLocal):
+    """Implements storage using arrow"""
+
+    def __init__(self, filename=None, table=None, write=False):
+        super(DatasetArrow, self).__init__(name=filename, path=filename, column_names=[])
+        self._write = write
+        if table is None:
+            self._load()
+        else:
+            self._load_table(table)
+
+    def _load(self):
+        source = pa.memory_map(self.path)
+        reader = pa.open_stream(source)
+        table = pa.Table.from_batches([b for b in reader])
+        self._load_table(table)
+
+    def _load_table(self, table):
+        self._length_unfiltered =  self._length_original = table.num_rows
+        for col in table.columns:
+            name = col.name
+            # TODO: keep the arrow columns, and support and test chunks
+            arrow_array = col.data.chunks[0]
+            arrow_type = arrow_array.type
+            buffers = arrow_array.buffers()
+            assert len(buffers) == 2
+            mask = None
+            bitmap_buffer = buffers[0]
+            data_buffer = buffers[1]
+            if bitmap_buffer is not None:
+                # arrow uses a bitmap https://github.com/apache/arrow/blob/master/format/Layout.md
+                bitmap = np.frombuffer(bitmap_buffer, np.uint8, len(bitmap_buffer))
+                # we do have to change the ordering of the bits
+                mask = 1-np.unpackbits(bitmap).reshape((len(bitmap),8))[:,::-1].reshape(-1)[:len(arrow_array)]
+            if isinstance(arrow_type, type(pa.binary(1))):
+                # mimics python/pyarrow/array.pxi::Array::to_numpy
+                # print(name, "seems to be a bytes type")
+                buffers = arrow_array.buffers()
+                assert len(buffers) == 2
+                dtype = "S" + str(arrow_type.byte_width)
+                # arrow seems to do padding, check if it is all ok
+                expected_length = arrow_type.byte_width * len(arrow_array)
+                actual_length = len(buffers[-1])
+                if actual_length < expected_length:
+                    raise ValueError('buffer is smaller (%d) than expected (%d)' % (actual_length, expected_length))
+                array = np.frombuffer(buffers[-1], dtype, len(arrow_array))# TODO: deal with offset ? [arrow_array.offset:arrow_array.offset + len(arrow_array)]
+            else:
+                dtype = arrow_array.type.to_pandas_dtype()
+            array = np.frombuffer(data_buffer, dtype, len(arrow_array))
+            if mask is not None:
+                array = np.ma.MaskedArray(array, mask=mask)
+            self.columns[name] = array
+            self.column_names.append(name)
+            self._save_assign_expression(name, vaex.expression.Expression(self, name))
+
+
+    @classmethod
+    def can_open(cls, path, *args, **kwargs):
+        return path.rpartition('.')[2] == 'arrow'
+
+    @classmethod
+    def get_options(cls, path):
+        return []
+
+    @classmethod
+    def option_to_args(cls, option):
+        return []
+
+vaex.file.other.dataset_type_map["arrow"] = DatasetArrow
+
diff --git a/packages/vaex-arrow/vaex_arrow/export.py b/packages/vaex-arrow/vaex_arrow/export.py
@@ -0,0 +1,100 @@
+__author__ = 'maartenbreddels'
+import os
+import sys
+import warnings
+import collections
+import logging
+
+import numpy as np
+import vaex
+
+from .convert import arrow_array_from_numpy_array
+
+max_length = int(1e5)
+
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+try:
+    import pyarrow as pa
+except:
+    if not on_rtd:
+        raise
+
+logger = logging.getLogger("vaex_arrow.export")
+
+def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
+    """
+    :param DatasetLocal dataset: dataset to export
+    :param str path: path for file
+    :param list[str] column_names: list of column names to export or None for all columns
+    :param str byteorder: = for native, < for little endian and > for big endian
+    :param bool shuffle: export rows in random order
+    :param bool selection: export selection or not
+    :param progress: progress callback that gets a progress fraction as argument and should return True to continue,
+            or a default progress bar when progress=True
+    :param bool virtual: When True, export virtual columns
+    :return:
+    """
+    column_names = column_names or dataset.get_column_names(virtual=virtual, strings=True)
+    for name in column_names:
+        if name not in dataset.columns:
+            warnings.warn('Exporting to arrow with virtual columns is not efficient')
+    N = len(dataset) if not selection else dataset.selected_length(selection)
+    if N == 0:
+        raise ValueError("Cannot export empty table")
+
+    if shuffle and sort:
+        raise ValueError("Cannot shuffle and sort at the same time")
+
+    if shuffle:
+        random_index_column = "random_index"
+        while random_index_column in dataset.get_column_names():
+            random_index_column += "_new"
+    partial_shuffle = shuffle and len(dataset) != N
+
+    order_array = None
+    if partial_shuffle:
+        # if we only export a portion, we need to create the full length random_index array, and
+        shuffle_array_full = np.random.choice(len(dataset), len(dataset), replace=False)
+        # then take a section of it
+        # shuffle_array[:] = shuffle_array_full[:N]
+        shuffle_array = shuffle_array_full[shuffle_array_full < N]
+        del shuffle_array_full
+        order_array = shuffle_array
+    elif shuffle:
+        shuffle_array = np.random.choice(N, N, replace=False)
+        order_array = shuffle_array
+
+    if sort:
+        if selection:
+            raise ValueError("sorting selections not yet supported")
+        logger.info("sorting...")
+        indices = np.argsort(dataset.evaluate(sort))
+        order_array = indices if ascending else indices[::-1]
+        logger.info("sorting done")
+
+    if selection:
+        full_mask = dataset.evaluate_selection_mask(selection)
+    else:
+        full_mask = None
+
+    arrow_arrays = []
+    for column_name in column_names:
+        mask = full_mask
+        if selection:
+            values = dataset.evaluate(column_name, filtered=False)
+            values = values[mask]
+        else:
+            values = dataset.evaluate(column_name)
+            if shuffle or sort:
+                indices = order_array
+                values = values[indices]
+        arrow_arrays.append(arrow_array_from_numpy_array(values))
+    if shuffle:
+        arrow_arrays.append(arrow_array_from_numpy_array(order_array))
+        column_names = column_names + [random_index_column]
+    table = pa.Table.from_arrays(arrow_arrays, column_names)
+    b = table.to_batches()
+    with pa.OSFile(path, 'wb') as sink:
+        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
+        writer.write_table(table)
+
diff --git a/packages/vaex-arrow/vaex_arrow/opener.py b/packages/vaex-arrow/vaex_arrow/opener.py
@@ -0,0 +1,14 @@
+import vaex.file
+
+class ArrowOpener:
+    @staticmethod
+    def can_open(path, *args, **kwargs):
+        return path.rpartition('.')[2] == 'arrow'
+
+    @staticmethod
+    def open(path, *args, **kwargs):
+        from .dataset import DatasetArrow
+        return DatasetArrow(path, *args, **kwargs)
+
+def register_opener():
+    vaex.file.register(ArrowOpener)
diff --git a/packages/vaex-core/vaex/__init__.py b/packages/vaex-core/vaex/__init__.py
@@ -289,6 +289,10 @@ def from_arrays(**arrays):
         dataset.add_column(name, array)
     return dataset
 
+def from_arrow_table(table):
+    """Creates a vaex dataset from an arrow Table"""
+    from vaex_arrow.convert import vaex_dataset_from_arrow_table
+    return vaex_dataset_from_arrow_table(table=table)
 
 def from_scalars(**kwargs):
     """Similar to from_arrays, but convenient for a dataset of length 1
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		__version_tuple__ = (0, 0, 0)
		__version__ = '0.0.0'