4 changes: 2 additions & 2 deletions ci/travis_before_script_cpp.sh
@@ -25,8 +25,8 @@ echo $GTEST_HOME
 
 CMAKE_COMMON_FLAGS="\
 -DARROW_BUILD_BENCHMARKS=ON \
--DARROW_PARQUET=ON \
--DARROW_HDFS=on \
+-DARROW_PARQUET=OFF \
+-DARROW_HDFS=ON \
 -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL"
 
 if [ $TRAVIS_OS_NAME == "linux" ]; then
4 changes: 3 additions & 1 deletion ci/travis_install_conda.sh
@@ -9,7 +9,9 @@ else
 fi
 
 wget -O miniconda.sh $MINICONDA_URL
-export MINICONDA=$TRAVIS_BUILD_DIR/miniconda
+
+export MINICONDA=$HOME/miniconda
+
 bash miniconda.sh -b -p $MINICONDA
 export PATH="$MINICONDA/bin:$PATH"
 conda update -y -q conda
6 changes: 4 additions & 2 deletions ci/travis_script_python.sh
@@ -5,7 +5,7 @@ set -e
 PYTHON_DIR=$TRAVIS_BUILD_DIR/python
 
 # Re-use conda installation from C++
-export MINICONDA=$TRAVIS_BUILD_DIR/miniconda
+export MINICONDA=$HOME/miniconda
 export PATH="$MINICONDA/bin:$PATH"
 export PARQUET_HOME=$MINICONDA
 
@@ -31,7 +31,9 @@ python_version_tests() {
 # Expensive dependencies install from Continuum package repo
 conda install -y pip numpy pandas cython
 
-conda install -y parquet-cpp arrow-cpp -c apache/channel/dev
+# conda install -y parquet-cpp
+
+conda install -y arrow-cpp -c apache/channel/dev
 
 # Other stuff pip install
 pip install -r requirements.txt
1 change: 1 addition & 0 deletions cpp/cmake_modules/FindParquet.cmake
@@ -72,6 +72,7 @@ else ()
 endif ()
 
 mark_as_advanced(
+  PARQUET_FOUND
   PARQUET_INCLUDE_DIR
   PARQUET_LIBS
   PARQUET_LIBRARIES
6 changes: 6 additions & 0 deletions cpp/src/arrow/util/memory-pool-test.cc
@@ -46,6 +46,10 @@ TEST(DefaultMemoryPool, OOM) {
   ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data));
 }
 
+// Death tests and valgrind are known to not play well 100% of the time. See
+// googletest documentation
+#ifndef ARROW_VALGRIND
+
 TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) {
   MemoryPool* pool = default_memory_pool();
 
@@ -60,4 +64,6 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) {
   pool->Free(data, 100);
 }
 
+#endif // ARROW_VALGRIND
+
 } // namespace arrow
41 changes: 25 additions & 16 deletions python/CMakeLists.txt
@@ -340,8 +340,10 @@ if (PYARROW_BUILD_TESTS)
 endif()
 
 ## Parquet
-find_package(Parquet REQUIRED)
-include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
+find_package(Parquet)
+if(PARQUET_FOUND)
+  include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
+endif()
 
 ## Arrow
 find_package(Arrow REQUIRED)
@@ -350,8 +352,6 @@ ADD_THIRDPARTY_LIB(arrow
   SHARED_LIB ${ARROW_SHARED_LIB})
 ADD_THIRDPARTY_LIB(arrow_io
   SHARED_LIB ${ARROW_IO_SHARED_LIB})
-ADD_THIRDPARTY_LIB(arrow_parquet
-  SHARED_LIB ${ARROW_PARQUET_SHARED_LIB})
 
 ############################################################
 # Linker setup
@@ -418,6 +418,16 @@ endif()
 add_subdirectory(src/pyarrow)
 add_subdirectory(src/pyarrow/util)
 
+set(CYTHON_EXTENSIONS
+  array
+  config
+  error
+  io
+  scalar
+  schema
+  table
+)
+
 set(PYARROW_SRCS
   src/pyarrow/common.cc
   src/pyarrow/config.cc
@@ -431,9 +441,19 @@ set(PYARROW_SRCS
 set(LINK_LIBS
   arrow
   arrow_io
-  arrow_parquet
 )
 
+if(PARQUET_FOUND AND ARROW_PARQUET_FOUND)
+  ADD_THIRDPARTY_LIB(arrow_parquet
+    SHARED_LIB ${ARROW_PARQUET_SHARED_LIB})
+  set(LINK_LIBS
+    ${LINK_LIBS}
+    arrow_parquet)
+  set(CYTHON_EXTENSIONS
+    ${CYTHON_EXTENSIONS}
+    parquet)
+endif()
 
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
 add_library(pyarrow SHARED
@@ -448,17 +468,6 @@ endif()
 # Setup and build Cython modules
 ############################################################
 
-set(CYTHON_EXTENSIONS
-  array
-  config
-  error
-  io
-  parquet
-  scalar
-  schema
-  table
-)
-
 foreach(module ${CYTHON_EXTENSIONS})
   string(REPLACE "." ";" directories ${module})
   list(GET directories -1 module_name)
26 changes: 16 additions & 10 deletions python/cmake_modules/FindArrow.cmake
@@ -52,7 +52,7 @@ find_library(ARROW_IO_LIB_PATH NAMES arrow_io
   ${ARROW_SEARCH_LIB_PATH}
   NO_DEFAULT_PATH)
 
-if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH)
+if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH)
   set(ARROW_FOUND TRUE)
   set(ARROW_LIB_NAME libarrow)
   set(ARROW_IO_LIB_NAME libarrow_io)
@@ -64,18 +64,9 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH)
 
   set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a)
   set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
-
-  set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a)
-  set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
-else ()
-  set(ARROW_FOUND FALSE)
-endif ()
-
-if (ARROW_FOUND)
   if (NOT Arrow_FIND_QUIETLY)
     message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}")
     message(STATUS "Found the Arrow IO library: ${ARROW_IO_LIB_PATH}")
-    message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}")
   endif ()
 else ()
   if (NOT Arrow_FIND_QUIETLY)
@@ -88,8 +79,23 @@ else ()
       message(STATUS "${ARROW_ERR_MSG}")
     endif (Arrow_FIND_REQUIRED)
   endif ()
+  set(ARROW_FOUND FALSE)
 endif ()
 
+if(ARROW_PARQUET_LIB_PATH)
+  set(ARROW_PARQUET_FOUND TRUE)
+  set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a)
+  set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+  if (NOT Arrow_FIND_QUIETLY)
+    message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}")
+  endif ()
+else()
+  if (NOT Arrow_FIND_QUIETLY)
+    message(STATUS "Could not find Arrow Parquet library")
+  endif()
+  set(ARROW_PARQUET_FOUND FALSE)
+endif()
+
 mark_as_advanced(
   ARROW_INCLUDE_DIR
   ARROW_LIBS
1 change: 1 addition & 0 deletions python/pyarrow/tests/test_io.py
@@ -46,6 +46,7 @@ def hdfs_test_client():
 
 HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000))
 
+
 @pytest.fixture(scope='session')
 def hdfs(request):
     fixture = hdfs_test_client()
38 changes: 26 additions & 12 deletions python/pyarrow/tests/test_parquet.py
@@ -15,33 +15,45 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from pyarrow.compat import unittest
-import pyarrow as arrow
-import pyarrow.parquet
+import pytest
 
-A = arrow
+import pyarrow as A
 
 import numpy as np
 import os.path
 import pandas as pd
 
 import pandas.util.testing as pdt
 
+try:
+    import pyarrow.parquet as pq
+    HAVE_PARQUET = True
+except ImportError:
+    HAVE_PARQUET = False
+
+# XXX: Make Parquet tests opt-in rather than skip-if-not-build
+parquet = pytest.mark.skipif(not HAVE_PARQUET,
+                             reason='Parquet support not built')
 
+
+@parquet
 def test_single_pylist_column_roundtrip(tmpdir):
     for dtype in [int, float]:
-        filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__))
+        filename = tmpdir.join('single_{}_column.parquet'
+                               .format(dtype.__name__))
         data = [A.from_pylist(list(map(dtype, range(5))))]
         table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
         A.parquet.write_table(table, filename.strpath)
-        table_read = pyarrow.parquet.read_table(filename.strpath)
-        for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()):
+        table_read = pq.read_table(filename.strpath)
+        for col_written, col_read in zip(table.itercolumns(),
+                                         table_read.itercolumns()):
             assert col_written.name == col_read.name
             assert col_read.data.num_chunks == 1
             data_written = col_written.data.chunk(0)
             data_read = col_read.data.chunk(0)
             assert data_written.equals(data_read)
 
 
+@parquet
 def test_pandas_parquet_2_0_rountrip(tmpdir):
     size = 10000
     np.random.seed(0)
@@ -58,17 +70,20 @@ def test_pandas_parquet_2_0_rountrip(tmpdir):
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
         # Pandas only support ns resolution, Arrow at the moment only ms
-        'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'),
+        'datetime': np.arange("2016-01-01T00:00:00.001", size,
+                              dtype='datetime64[ms]'),
         'str': [str(x) for x in range(size)],
         'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None]
     })
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
     A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
-    table_read = pyarrow.parquet.read_table(filename.strpath)
+    table_read = pq.read_table(filename.strpath)
     df_read = table_read.to_pandas()
     pdt.assert_frame_equal(df, df_read)
 
 
+@parquet
 def test_pandas_parquet_1_0_rountrip(tmpdir):
     size = 10000
     np.random.seed(0)
@@ -88,11 +103,10 @@ def test_pandas_parquet_1_0_rountrip(tmpdir):
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
-    table_read = pyarrow.parquet.read_table(filename.strpath)
+    table_read = pq.read_table(filename.strpath)
     df_read = table_read.to_pandas()
 
     # We pass uint32_t as int64_t if we write Parquet version 1.0
     df['uint32'] = df['uint32'].values.astype(np.int64)
 
     pdt.assert_frame_equal(df, df_read)
-
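A note on the XXX comment in the diff above: the skipif-based marker makes these tests silently skip whenever the Parquet extension was not built, while the comment asks for them to become opt-in instead. A minimal sketch of what an opt-in scheme could look like, assuming a hypothetical conftest.py with a --parquet command-line flag and a plain pytest.mark.parquet marker; neither exists in this PR:

# conftest.py (hypothetical sketch, not part of this change)
import pytest


def pytest_addoption(parser):
    # Assumed flag name: Parquet tests run only when --parquet is passed.
    parser.addoption('--parquet', action='store_true', default=False,
                     help='run tests that need the Parquet extension')


def pytest_collection_modifyitems(config, items):
    if config.getoption('--parquet'):
        return
    skip_parquet = pytest.mark.skip(reason='needs --parquet option to run')
    for item in items:
        # Tests would be decorated with @pytest.mark.parquet rather than
        # the skipif-based @parquet marker used in this diff.
        if 'parquet' in item.keywords:
            item.add_marker(skip_parquet)

With that in place, py.test --parquet would enable the whole group explicitly, and a missing extension would surface as a failure rather than a silent skip.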
7 changes: 1 addition & 6 deletions python/pyarrow/tests/test_table.py
@@ -16,11 +16,7 @@
 # under the License.
 
 from pyarrow.compat import unittest
-import pyarrow as arrow
-
-A = arrow
-
-import pandas as pd
+import pyarrow as A
 
 
 class TestRowBatch(unittest.TestCase):
@@ -76,4 +72,3 @@ def test_pandas(self):
         assert set(df.columns) == set(('a', 'b'))
         assert df.shape == (5, 2)
         assert df.ix[0, 'b'] == -10
-
27 changes: 16 additions & 11 deletions python/setup.py
@@ -97,6 +97,18 @@ def initialize_options(self):
         _build_ext.initialize_options(self)
         self.extra_cmake_args = ''
 
+    CYTHON_MODULE_NAMES = [
+        'array',
+        'config',
+        'error',
+        'io',
+        'parquet',
+        'scalar',
+        'schema',
+        'table']
+
+    CYTHON_ALLOWED_FAILURES = ['parquet']
+
     def _run_cmake(self):
         # The directory containing this setup.py
         source = osp.dirname(osp.abspath(__file__))
@@ -172,10 +184,13 @@ def _run_cmake(self):
 
         # Move the built C-extension to the place expected by the Python build
         self._found_names = []
-        for name in self.get_cmake_cython_names():
+        for name in self.CYTHON_MODULE_NAMES:
             built_path = self.get_ext_built(name)
             if not os.path.exists(built_path):
                 print(built_path)
+                if name in self.CYTHON_ALLOWED_FAILURES:
+                    print('Cython module {0} failure permitted'.format(name))
+                    continue
                 raise RuntimeError('libpyarrow C-extension failed to build:',
                                    os.path.abspath(built_path))
 

Expand Down Expand Up @@ -213,16 +228,6 @@ def get_ext_built(self, name):
suffix = sysconfig.get_config_var('SO')
return name + suffix

def get_cmake_cython_names(self):
return ['array',
'config',
'error',
'io',
'parquet',
'scalar',
'schema',
'table']

def get_names(self):
return self._found_names

Expand Down
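For context on the setup.py hunks above: CYTHON_ALLOWED_FAILURES lets the packaging step tolerate a missing optional extension (parquet) while still failing hard when a required module did not build. A condensed, self-contained sketch of the same pattern; collect_built_modules and the trailing lambda are illustrations, not code from this PR:

import os

# Hypothetical stand-ins for the class attributes added in setup.py above.
CYTHON_MODULE_NAMES = ['array', 'config', 'io', 'parquet', 'table']
CYTHON_ALLOWED_FAILURES = ['parquet']  # optional components


def collect_built_modules(get_ext_built):
    """Return the extension modules that actually got built.

    Modules in CYTHON_ALLOWED_FAILURES are skipped with a notice when
    missing; any other missing module raises a hard build error.
    """
    found = []
    for name in CYTHON_MODULE_NAMES:
        built_path = get_ext_built(name)
        if not os.path.exists(built_path):
            if name in CYTHON_ALLOWED_FAILURES:
                print('Cython module {0} failure permitted'.format(name))
                continue
            raise RuntimeError('C-extension failed to build: '
                               + os.path.abspath(built_path))
        found.append(name)
    return found


# Usage, with the same suffix logic setup.py derives from sysconfig:
# collect_built_modules(lambda name: name + '.so')

The design point is that only the names listed as allowed failures get the soft path; a typo in a required module name still stops the build immediately.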