4 changes: 2 additions & 2 deletions ci/travis_lint.sh
@@ -43,9 +43,9 @@ if [ "$ARROW_CI_PYTHON_AFFECTED" != "0" ]; then

PYTHON_DIR=$TRAVIS_BUILD_DIR/python

flake8 --count $PYTHON_DIR/pyarrow
flake8 --count $PYTHON_DIR

# Check Cython files with some checks turned off
flake8 --count --config=$PYTHON_DIR/.flake8.cython \
$PYTHON_DIR/pyarrow
$PYTHON_DIR
fi
1 change: 0 additions & 1 deletion python/benchmarks/__init__.py
@@ -14,4 +14,3 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

1 change: 0 additions & 1 deletion python/benchmarks/array_ops.py
@@ -15,7 +15,6 @@
# specific language governing permissions and limitations
# under the License.

import numpy as np
import pyarrow as pa


20 changes: 9 additions & 11 deletions python/benchmarks/common.py
@@ -19,7 +19,6 @@
import decimal
from functools import partial
import itertools
import os
Member: flake8 fails on line 41 as it's Python3 only. We should either only execute it for Python 3 or make it Python2 compatible.

Member (Author): fixed
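Assuming the flagged line is one of the keyword-only-argument signatures changed below, a minimal sketch of the incompatibility (Python 2 rejects the bare "*" in a def):

    # Python 3 only: the bare "*" makes seed a keyword-only argument
    def get_random_bytes(n, *, seed=42):
        ...

    # Python 2 compatible form, as used after this change
    def get_random_bytes(n, seed=42):
        ...
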
import sys
import unicodedata

@@ -39,7 +38,7 @@ def _multiplicate_sequence(base, target_size):
return [base] * q + [base[:r]]


def get_random_bytes(n, *, seed=42):
def get_random_bytes(n, seed=42):
"""
Generate a random bytes object of size *n*.
Note the result might be compressible.
@@ -58,7 +57,7 @@ def get_random_bytes(n, *, seed=42):
return result


def get_random_ascii(n, *, seed=42):
def get_random_ascii(n, seed=42):
"""
Get a random ASCII-only unicode string of size *n*.
"""
@@ -69,7 +68,7 @@ def get_random_ascii(n, *, seed=42):
return result


def _random_unicode_letters(n, *, seed=42):
def _random_unicode_letters(n, seed=42):
"""
Generate a string of random unicode letters (slow).
"""
@@ -93,7 +92,7 @@ def _get_more_candidates():
_1024_random_unicode_letters = _random_unicode_letters(1024)


def get_random_unicode(n, *, seed=42):
def get_random_unicode(n, seed=42):
"""
Get a random non-ASCII unicode string of size *n*.
"""
@@ -179,7 +178,8 @@ def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB):
self.sprinkle_nones(data, none_prob)
return data

def _generate_varying_sequences(self, random_factory, n, min_size, max_size, none_prob):
def _generate_varying_sequences(self, random_factory, n, min_size,
max_size, none_prob):
"""
Generate a list of *n* sequences of varying size between *min_size*
and *max_size*, with *none_prob* probability of an entry being None.
@@ -207,7 +207,6 @@ def generate_fixed_binary_list(self, n, size, none_prob=DEFAULT_NONE_PROB):
return self._generate_varying_sequences(get_random_bytes, n,
size, size, none_prob)


def generate_varying_binary_list(self, n, min_size, max_size,
none_prob=DEFAULT_NONE_PROB):
"""
@@ -217,7 +216,6 @@ def generate_varying_binary_list(self, n, min_size, max_size,
return self._generate_varying_sequences(get_random_bytes, n,
min_size, max_size, none_prob)


def generate_ascii_string_list(self, n, min_size, max_size,
none_prob=DEFAULT_NONE_PROB):
"""
@@ -227,7 +225,6 @@ def generate_ascii_string_list(self, n, min_size, max_size,
return self._generate_varying_sequences(get_random_ascii, n,
min_size, max_size, none_prob)


def generate_unicode_string_list(self, n, min_size, max_size,
none_prob=DEFAULT_NONE_PROB):
"""
@@ -237,7 +234,6 @@ def generate_unicode_string_list(self, n, min_size, max_size,
return self._generate_varying_sequences(get_random_unicode, n,
min_size, max_size, none_prob)


def generate_int_list_list(self, n, min_size, max_size,
none_prob=DEFAULT_NONE_PROB):
"""
@@ -263,7 +259,9 @@ def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
"""
Generate a list of dicts with random values.
Each dict has the form `{'u': int value, 'v': float value, 'w': bool value}`
Each dict has the form

`{'u': int value, 'v': float value, 'w': bool value}`
"""
ints = self.generate_int_list(n, none_prob=none_prob)
floats = self.generate_float_list(n, none_prob=none_prob)
2 changes: 0 additions & 2 deletions python/benchmarks/microbenchmarks.py
@@ -15,7 +15,6 @@
# specific language governing permissions and limitations
# under the License.

import pyarrow as pa
import pyarrow.benchmark as pb

from . import common
@@ -44,4 +43,3 @@ def setup(self, type_name):

def time_PandasObjectIsNull(self, *args):
pb.benchmark_PandasObjectIsNull(self.lst)

9 changes: 4 additions & 5 deletions python/benchmarks/plasma.py
@@ -18,11 +18,8 @@
import numpy as np
import timeit

import pyarrow as pa
import pyarrow.plasma as plasma

from . import common


class SimplePlasmaThroughput(object):
"""Benchmark plasma store throughput with a single client."""
@@ -32,7 +29,8 @@ class SimplePlasmaThroughput(object):
timer = timeit.default_timer

def setup(self, size):
self.plasma_store_ctx = plasma.start_plasma_store(plasma_store_memory=10**9)
self.plasma_store_ctx = plasma.start_plasma_store(
plasma_store_memory=10**9)
plasma_store_name, p = self.plasma_store_ctx.__enter__()
self.plasma_client = plasma.connect(plasma_store_name, "", 64)

@@ -51,7 +49,8 @@ class SimplePlasmaLatency(object):
timer = timeit.default_timer

def setup(self):
self.plasma_store_ctx = plasma.start_plasma_store(plasma_store_memory=10**9)
self.plasma_store_ctx = plasma.start_plasma_store(
plasma_store_memory=10**9)
plasma_store_name, p = self.plasma_store_ctx.__enter__()
self.plasma_client = plasma.connect(plasma_store_name, "", 64)

15 changes: 9 additions & 6 deletions python/benchmarks/streaming.py
@@ -26,12 +26,15 @@
def generate_chunks(total_size, nchunks, ncols, dtype=np.dtype('int64')):
rowsize = total_size // nchunks // ncols
assert rowsize % dtype.itemsize == 0

def make_column(col, chunk):
return np.frombuffer(common.get_random_bytes(
rowsize, seed=col + 997 * chunk)).view(dtype)

return [pd.DataFrame({
'c' + str(col): np.frombuffer(
common.get_random_bytes(rowsize, seed=col + 997 * chunk)).view(dtype)
for col in range(ncols)
})
for chunk in range(nchunks)]
'c' + str(col): make_column(col, chunk)
for col in range(ncols)})
for chunk in range(nchunks)]


class StreamReader(object):
@@ -64,4 +67,4 @@ def setup(self, chunk_size):
def time_read_to_dataframe(self, *args):
reader = pa.RecordBatchStreamReader(self.source)
table = reader.read_all()
df = table.to_pandas()
df = table.to_pandas() # noqa
6 changes: 3 additions & 3 deletions python/doc/source/conf.py
@@ -30,6 +30,7 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import glob
import os
import sys

@@ -77,7 +78,6 @@
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

import glob
autosummary_generate = glob.glob("*.rst")

# The encoding of source files.
@@ -187,8 +187,8 @@
# html_logo = None

# The name of an image file (relative to this directory) to use as a favicon of
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
# the docs. This file should be a Windows icon file (.ico) being 16x16 or
# 32x32 pixels large.
#
# html_favicon = None

4 changes: 2 additions & 2 deletions python/examples/plasma/sorting/multimerge.pyx
@@ -23,9 +23,9 @@ from libc.stdint cimport uintptr_t
from libcpp.vector cimport vector
from libcpp.pair cimport pair

cimport numpy as np
import numpy as np

cimport numpy as np

cdef extern from "<queue>" namespace "std" nogil:
cdef cppclass priority_queue[T]:
@@ -44,7 +44,7 @@ def multimerge2d(*arrays):
This assumes C style ordering for both input and output arrays. For
each input array we have array[i,0] <= array[i+1,0] and for the output
array the same will hold.

Ideally this code would be simpler and also support both C style
and Fortran style ordering.
"""
1 change: 0 additions & 1 deletion python/examples/plasma/sorting/sort_df.py
@@ -17,7 +17,6 @@

from multiprocessing import Pool
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.plasma as plasma
1 change: 1 addition & 0 deletions python/scripts/test_leak.py
@@ -57,4 +57,5 @@ def leak2():

gc.collect()


leak2()
26 changes: 16 additions & 10 deletions python/setup.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -102,7 +101,8 @@ def run(self):
('with-static-boost', None, 'link boost statically'),
('with-plasma', None, 'build the Plasma extension'),
('with-orc', None, 'build the ORC extension'),
('generate-coverage', None, 'enable Cython code coverage'),
('generate-coverage', None,
'enable Cython code coverage'),
('bundle-boost', None,
'bundle the (shared) Boost libraries'),
('bundle-arrow-cpp', None,
@@ -116,7 +116,8 @@ def initialize_options(self):
self.cmake_generator = 'Visual Studio 14 2015 Win64'
self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '')
self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower()
self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE', 'boost')
self.boost_namespace = os.environ.get('PYARROW_BOOST_NAMESPACE',
'boost')

self.cmake_cxxflags = os.environ.get('PYARROW_CXXFLAGS', '')

@@ -252,7 +253,8 @@ def _run_cmake(self):
print("-- Finished cmake for pyarrow")
# Do the build
print("-- Running cmake --build for pyarrow")
self.spawn(['cmake', '--build', '.', '--config', self.build_type])
self.spawn(['cmake', '--build', '.', '--config',
self.build_type])
print("-- Finished cmake --build for pyarrow")

if self.inplace:
@@ -297,14 +299,16 @@ def _run_cmake(self):
shutil.move(pjoin(build_prefix, 'include'),
pjoin(build_lib, 'pyarrow'))

# Move the built C-extension to the place expected by the Python build
# Move the built C-extension to the place expected by the Python
# build
self._found_names = []
for name in self.CYTHON_MODULE_NAMES:
built_path = self.get_ext_built(name)
if not os.path.exists(built_path):
print(built_path)
if self._failure_permitted(name):
print('Cython module {0} failure permitted'.format(name))
print('Cython module {0} failure permitted'
.format(name))
continue
raise RuntimeError('pyarrow C-extension failed to build:',
os.path.abspath(built_path))
@@ -337,11 +341,11 @@ def _run_cmake(self):

if os.path.exists(self.get_ext_built_api_header(name)):
shutil.move(self.get_ext_built_api_header(name),
pjoin(os.path.dirname(ext_path), name + '_api.h'))
pjoin(os.path.dirname(ext_path),
name + '_api.h'))

# Move the plasma store
if self.with_plasma:
build_py = self.get_finalized_command('build_py')
source = os.path.join(self.build_type, "plasma_store")
target = os.path.join(build_lib,
self._get_build_dir(),
@@ -486,7 +490,8 @@ def has_ext_modules(foo):
def parse_version(root):
from setuptools_scm import version_from_scm
import setuptools_scm.git
describe = setuptools_scm.git.DEFAULT_DESCRIBE + " --match 'apache-arrow-[0-9]*'"
describe = (setuptools_scm.git.DEFAULT_DESCRIBE +
" --match 'apache-arrow-[0-9]*'")
# Strip catchall from the commandline
describe = describe.replace("--match *.*", "")
version = setuptools_scm.git.parse(root, describe)
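As a side note, a minimal sketch of the string manipulation parse_version performs (the exact value of setuptools_scm.git.DEFAULT_DESCRIBE is an assumption here; only the "--match *.*" fragment is confirmed by the replace() above):

    # assumed default; the real DEFAULT_DESCRIBE may differ apart from "--match *.*"
    default_describe = "git describe --dirty --tags --long --match *.*"
    describe = default_describe + " --match 'apache-arrow-[0-9]*'"
    describe = describe.replace("--match *.*", "")
    # -> "git describe --dirty --tags --long  --match 'apache-arrow-[0-9]*'"
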
@@ -520,7 +525,8 @@ def parse_version(root):
'plasma_store = pyarrow:_plasma_store_entry_point'
]
},
use_scm_version={"root": "..", "relative_to": __file__, "parse": parse_version},
use_scm_version={"root": "..", "relative_to": __file__,
"parse": parse_version},
setup_requires=['setuptools_scm', 'cython >= 0.27'] + setup_requires,
install_requires=install_requires,
tests_require=['pytest', 'pandas'],
2 changes: 0 additions & 2 deletions python/testing/parquet_interop.py
@@ -16,10 +16,8 @@
# under the License.

import os
import pytest

import fastparquet
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pandas.util.testing as tm