Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 58 additions & 11 deletions python/doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ API Reference

.. _api.functions:

Type Metadata and Schemas
-------------------------
Type and Schema Factory Functions
---------------------------------

.. autosummary::
:toctree: generated/
Expand All @@ -43,6 +43,8 @@ Type Metadata and Schemas
float16
float32
float64
time32
time64
timestamp
date32
date64
Expand All @@ -53,10 +55,8 @@ Type Metadata and Schemas
struct
dictionary
field
DataType
Field
Schema
schema
from_numpy_dtype

Scalar Value Types
------------------
Expand All @@ -68,6 +68,7 @@ Scalar Value Types
NAType
Scalar
ArrayValue
BooleanValue
Int8Value
Int16Value
Int32Value
Expand All @@ -82,6 +83,11 @@ Scalar Value Types
BinaryValue
StringValue
FixedSizeBinaryValue
Date32Value
Date64Value
TimestampValue
DecimalValue


Array Types and Constructors
----------------------------
Expand All @@ -91,31 +97,42 @@ Array Types and Constructors

array
Array
NullArray
NumericArray
IntegerArray
FloatingPointArray
BooleanArray
DictionaryArray
FloatingPointArray
IntegerArray
Int8Array
Int16Array
Int32Array
Int64Array
NullArray
NumericArray
UInt8Array
UInt16Array
UInt32Array
UInt64Array
DictionaryArray
BinaryArray
FixedSizeBinaryArray
StringArray
Time32Array
Time64Array
Date32Array
Date64Array
TimestampArray
DecimalArray
ListArray
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might consider reorganizing all the logical types by type class instead of by semantic type (e.g. all the List* things together instead of all the Arrays together)


Tables and Record Batches
-------------------------

.. autosummary::
:toctree: generated/

ChunkedArray
Column
RecordBatch
Table
get_record_batch_size

Tensor type and Functions
-------------------------
Expand All @@ -141,7 +158,7 @@ Input / Output and Shared Memory
MemoryMappedFile
memory_map
create_memory_map
PythonFileInterface
PythonFile

Interprocess Communication and Messaging
----------------------------------------
Expand All @@ -165,3 +182,33 @@ Memory Pools
jemalloc_memory_pool
total_allocated_bytes
set_memory_pool

Type Classes
------------

.. autosummary::
:toctree: generated/

DataType
DecimalType
DictionaryType
FixedSizeBinaryType
Time32Type
Time64Type
TimestampType
Field
Schema

.. currentmodule:: pyarrow.parquet

Apache Parquet
--------------

.. autosummary::
:toctree: generated/

ParquetDataset
ParquetFile
read_table
write_metadata
write_table
33 changes: 25 additions & 8 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,20 @@
from pyarrow._array import (null, bool_,
int8, int16, int32, int64,
uint8, uint16, uint32, uint64,
timestamp, date32, date64,
time32, time64, timestamp, date32, date64,
float16, float32, float64,
binary, string, decimal,
list_, struct, dictionary, field,
DataType, FixedSizeBinaryType,
Field, Schema, schema,
DataType,
DecimalType,
DictionaryType,
FixedSizeBinaryType,
TimestampType,
Time32Type,
Time64Type,
Field,
Schema,
schema,
Array, Tensor,
array,
from_numpy_dtype,
Expand All @@ -47,25 +55,34 @@
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
ListArray, StringArray,
ListArray,
BinaryArray, StringArray,
FixedSizeBinaryArray,
DictionaryArray,
Date32Array, Date64Array,
TimestampArray, Time32Array, Time64Array,
DecimalArray,
ArrayValue, Scalar, NA, NAType,
BooleanValue,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
FloatValue, DoubleValue, ListValue,
BinaryValue, StringValue, FixedSizeBinaryValue)
BinaryValue, StringValue, FixedSizeBinaryValue,
DecimalValue,
Date32Value, Date64Value, TimestampValue)

from pyarrow._io import (HdfsFile, NativeFile, PythonFileInterface,
from pyarrow._io import (HdfsFile, NativeFile, PythonFile,
Buffer, BufferReader, InMemoryOutputStream,
OSFile, MemoryMappedFile, memory_map,
frombuffer, read_tensor, write_tensor,
memory_map, create_memory_map,
get_record_batch_size, get_tensor_size)
get_record_batch_size, get_tensor_size,
have_libhdfs, have_libhdfs3)

from pyarrow._memory import (MemoryPool, total_allocated_bytes,
set_memory_pool, default_memory_pool)
from pyarrow._table import Column, RecordBatch, Table, concat_tables
from pyarrow._table import (ChunkedArray, Column, RecordBatch, Table,
concat_tables)
from pyarrow._error import (ArrowException,
ArrowKeyError,
ArrowInvalid,
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/_array.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ cdef class TimestampType(DataType):
const CTimestampType* ts_type


cdef class Time32Type(DataType):
    # Typed view of the wrapped C++ arrow::Time32Type; assigned in init().
    cdef const CTime32Type* time_type


cdef class Time64Type(DataType):
    # Typed view of the wrapped C++ arrow::Time64Type; assigned in init().
    cdef const CTime64Type* time_type


cdef class FixedSizeBinaryType(DataType):
cdef:
const CFixedSizeBinaryType* fixed_size_binary_type
Expand Down
74 changes: 73 additions & 1 deletion python/pyarrow/_array.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,30 @@ cdef class TimestampType(DataType):
return None


cdef class Time32Type(DataType):
    """
    Data type wrapper for the 32-bit time type.
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        # Let the base class store the shared pointer, then keep a typed
        # raw pointer so Time32-specific accessors can be called directly.
        DataType.init(self, type)
        self.time_type = <const CTime32Type*> type.get()

    @property
    def unit(self):
        """
        The time unit of this type, as returned by timeunit_to_string.
        """
        return timeunit_to_string(self.time_type.unit())


cdef class Time64Type(DataType):
    """
    Data type wrapper for the 64-bit time type.
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        # Let the base class store the shared pointer, then keep a typed
        # raw pointer so Time64-specific accessors can be called directly.
        DataType.init(self, type)
        self.time_type = <const CTime64Type*> type.get()

    @property
    def unit(self):
        """
        The time unit of this type, as returned by timeunit_to_string.
        """
        return timeunit_to_string(self.time_type.unit())


cdef class FixedSizeBinaryType(DataType):

cdef void init(self, const shared_ptr[CDataType]& type):
Expand Down Expand Up @@ -342,6 +366,7 @@ def int64():


cdef dict _timestamp_type_cache = {}
cdef dict _time_type_cache = {}


cdef timeunit_to_string(TimeUnit unit):
Expand Down Expand Up @@ -369,7 +394,7 @@ def timestamp(unit_str, tz=None):
elif unit_str == 'ns':
unit = TimeUnit_NANO
else:
raise TypeError('Invalid TimeUnit string')
raise ValueError('Invalid TimeUnit string')

cdef TimestampType out = TimestampType()

Expand All @@ -388,6 +413,50 @@ def timestamp(unit_str, tz=None):
return out


def time32(unit_str):
    """
    Create an instance of the 32-bit time type with the given unit.

    Parameters
    ----------
    unit_str : str
        Time unit: 's' (second) or 'ms' (millisecond).

    Returns
    -------
    Time32Type
        The same cached instance is returned on repeated calls with the
        same unit.

    Raises
    ------
    ValueError
        If unit_str is neither 's' nor 'ms'.
    """
    cdef:
        TimeUnit unit
        Time32Type out

    if unit_str == 's':
        unit = TimeUnit_SECOND
    elif unit_str == 'ms':
        unit = TimeUnit_MILLI
    else:
        raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))

    # _time_type_cache is also used by time64, keyed by unit only. That is
    # safe because time32 only ever stores SECOND/MILLI keys while time64
    # only stores MICRO/NANO keys, so the key sets never overlap.
    if unit in _time_type_cache:
        return _time_type_cache[unit]

    out = Time32Type()
    out.init(ctime32(unit))
    _time_type_cache[unit] = out
    return out


def time64(unit_str):
    """
    Create an instance of the 64-bit time type with the given unit.

    Parameters
    ----------
    unit_str : str
        Time unit: 'us' (microsecond) or 'ns' (nanosecond).

    Returns
    -------
    Time64Type
        The same cached instance is returned on repeated calls with the
        same unit.

    Raises
    ------
    ValueError
        If unit_str is neither 'us' nor 'ns'.
    """
    cdef:
        TimeUnit unit
        Time64Type out

    if unit_str == 'us':
        unit = TimeUnit_MICRO
    elif unit_str == 'ns':
        unit = TimeUnit_NANO
    else:
        raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))

    # _time_type_cache is also used by time32, keyed by unit only. That is
    # safe because time64 only ever stores MICRO/NANO keys while time32
    # only stores SECOND/MILLI keys, so the key sets never overlap.
    if unit in _time_type_cache:
        return _time_type_cache[unit]

    out = Time64Type()
    out.init(ctime64(unit))
    _time_type_cache[unit] = out
    return out


def date32():
return primitive_type(_Type_DATE32)

Expand Down Expand Up @@ -516,6 +585,9 @@ cdef Schema box_schema(const shared_ptr[CSchema]& type):


def from_numpy_dtype(object dtype):
"""
Convert NumPy dtype to pyarrow.DataType
"""
cdef shared_ptr[CDataType] c_type
with nogil:
check_status(pyarrow.NumPyDtypeToArrow(dtype, &c_type))
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ cdef class NativeFile:
# Python file-like objects


cdef class PythonFileInterface(NativeFile):
cdef class PythonFile(NativeFile):
cdef:
object handle

Expand Down Expand Up @@ -600,7 +600,7 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader):
source = BufferReader(source)
elif not isinstance(source, NativeFile) and hasattr(source, 'read'):
# Optimistically hope this is file-like
source = PythonFileInterface(source, mode='r')
source = PythonFile(source, mode='r')

if isinstance(source, NativeFile):
nf = source
Expand All @@ -622,7 +622,7 @@ cdef get_writer(object source, shared_ptr[OutputStream]* writer):
source = OSFile(source, mode='w')
elif not isinstance(source, NativeFile) and hasattr(source, 'write'):
# Optimistically hope this is file-like
source = PythonFileInterface(source, mode='w')
source = PythonFile(source, mode='w')

if isinstance(source, NativeFile):
nf = source
Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CTime64Type" arrow::Time64Type"(CFixedWidthType):
TimeUnit unit()

shared_ptr[CDataType] ctime32" arrow::time32"(TimeUnit unit)
shared_ptr[CDataType] ctime64" arrow::time64"(TimeUnit unit)

cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType):
CDictionaryType(const shared_ptr[CDataType]& index_type,
const shared_ptr[CArray]& dictionary)
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
def test_python_file_write():
buf = BytesIO()

f = pa.PythonFileInterface(buf)
f = pa.PythonFile(buf)

assert f.tell() == 0

Expand All @@ -56,7 +56,7 @@ def test_python_file_read():
data = b'some sample data'

buf = BytesIO(data)
f = pa.PythonFileInterface(buf, mode='r')
f = pa.PythonFile(buf, mode='r')

assert f.size() == len(data)

Expand Down
Loading