Skip to content

Commit 1a90629

Browse files
authored
Merge branch 'master' into series_rolling_count_ignores_min_periods
2 parents bfe10f0 + bbcda98 commit 1a90629

File tree

105 files changed

+1920
-1310
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+1920
-1310
lines changed

asv_bench/benchmarks/reshape.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ def time_pivot_table_categorical_observed(self):
161161
observed=True,
162162
)
163163

164+
def time_pivot_table_margins_only_column(self):
165+
self.df.pivot_table(columns=["key2", "key3"], margins=True)
166+
164167

165168
class Crosstab:
166169
def setup(self):

ci/deps/azure-37-locale.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@ dependencies:
3434
- xlsxwriter
3535
- xlwt
3636
- pyarrow>=0.15
37+
- pip
38+
- pip:
39+
- pyxlsb

ci/deps/azure-macos-36.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ dependencies:
3333
- pip
3434
- pip:
3535
- pyreadstat
36+
- pyxlsb

ci/deps/azure-windows-37.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ dependencies:
3535
- xlsxwriter
3636
- xlwt
3737
- pyreadstat
38+
- pip
39+
- pip:
40+
- pyxlsb

ci/deps/travis-36-cov.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,4 @@ dependencies:
5151
- coverage
5252
- pandas-datareader
5353
- python-dateutil
54+
- pyxlsb

ci/print_skipped.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
import os
33
import xml.etree.ElementTree as et
44

doc/make.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
"""
33
Python script for building documentation.
44

doc/source/getting_started/install.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and
264264
pymysql 0.7.11 MySQL engine for sqlalchemy
265265
pyreadstat SPSS files (.sav) reading
266266
pytables 3.4.2 HDF5 reading / writing
267+
pyxlsb 1.0.5 Reading xlsb files
267268
qtpy Clipboard I/O
268269
s3fs 0.3.0 Amazon S3 access
269270
tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_)

doc/source/user_guide/io.rst

Lines changed: 70 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
2323
text;`JSON <https://www.json.org/>`__;:ref:`read_json<io.json_reader>`;:ref:`to_json<io.json_writer>`
2424
text;`HTML <https://en.wikipedia.org/wiki/HTML>`__;:ref:`read_html<io.read_html>`;:ref:`to_html<io.html>`
2525
text; Local clipboard;:ref:`read_clipboard<io.clipboard>`;:ref:`to_clipboard<io.clipboard>`
26-
binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
26+
binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
2727
binary;`OpenDocument <http://www.opendocumentformat.org>`__;:ref:`read_excel<io.ods>`;
2828
binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
2929
binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
@@ -2768,7 +2768,8 @@ Excel files
27682768

27692769
The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
27702770
files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files
2771-
can be read using either ``xlrd`` or ``openpyxl``.
2771+
can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``)
2772+
files can be read using ``pyxlsb``.
27722773
The :meth:`~DataFrame.to_excel` instance method is used for
27732774
saving a ``DataFrame`` to Excel. Generally the semantics are
27742775
similar to working with :ref:`csv<io.read_csv_table>` data.
@@ -3229,6 +3230,30 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using
32293230
Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
32303231
is not implemented.
32313232

3233+
.. _io.xlsb:
3234+
3235+
Binary Excel (.xlsb) files
3236+
--------------------------
3237+
3238+
.. versionadded:: 1.0.0
3239+
3240+
The :func:`~pandas.read_excel` method can also read binary Excel files
3241+
using the ``pyxlsb`` module. The semantics and features for reading
3242+
binary Excel files mostly match what can be done for `Excel files`_ using
3243+
``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types
3244+
in files and will return floats instead.
3245+
3246+
.. code-block:: python
3247+
3248+
# Returns a DataFrame
3249+
pd.read_excel('path_to_file.xlsb', engine='pyxlsb')
3250+
3251+
.. note::
3252+
3253+
Currently pandas only supports *reading* binary Excel files. Writing
3254+
is not implemented.
3255+
3256+
32323257
.. _io.clipboard:
32333258

32343259
Clipboard
@@ -4220,46 +4245,49 @@ Compression
42204245
all kinds of stores, not just tables. Two parameters are used to
42214246
control compression: ``complevel`` and ``complib``.
42224247

4223-
``complevel`` specifies if and how hard data is to be compressed.
4224-
``complevel=0`` and ``complevel=None`` disables
4225-
compression and ``0<complevel<10`` enables compression.
4226-
4227-
``complib`` specifies which compression library to use. If nothing is
4228-
specified the default library ``zlib`` is used. A
4229-
compression library usually optimizes for either good
4230-
compression rates or speed and the results will depend on
4231-
the type of data. Which type of
4232-
compression to choose depends on your specific needs and
4233-
data. The list of supported compression libraries:
4234-
4235-
- `zlib <https://zlib.net/>`_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow.
4236-
- `lzo <https://www.oberhumer.com/opensource/lzo/>`_: Fast compression and decompression.
4237-
- `bzip2 <http://bzip.org/>`_: Good compression rates.
4238-
- `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
4239-
4240-
Support for alternative blosc compressors:
4241-
4242-
- `blosc:blosclz <http://www.blosc.org/>`_ This is the
4243-
default compressor for ``blosc``
4244-
- `blosc:lz4
4245-
<https://fastcompression.blogspot.dk/p/lz4.html>`_:
4246-
A compact, very popular and fast compressor.
4247-
- `blosc:lz4hc
4248-
<https://fastcompression.blogspot.dk/p/lz4.html>`_:
4249-
A tweaked version of LZ4, produces better
4250-
compression ratios at the expense of speed.
4251-
- `blosc:snappy <https://google.github.io/snappy/>`_:
4252-
A popular compressor used in many places.
4253-
- `blosc:zlib <https://zlib.net/>`_: A classic;
4254-
somewhat slower than the previous ones, but
4255-
achieving better compression ratios.
4256-
- `blosc:zstd <https://facebook.github.io/zstd/>`_: An
4257-
extremely well balanced codec; it provides the best
4258-
compression ratios among the others above, and at
4259-
reasonably fast speed.
4260-
4261-
If ``complib`` is defined as something other than the
4262-
listed libraries a ``ValueError`` exception is issued.
4248+
* ``complevel`` specifies if and how hard data is to be compressed.
4249+
``complevel=0`` and ``complevel=None`` disable compression and
4250+
``0<complevel<10`` enables compression.
4251+
4252+
* ``complib`` specifies which compression library to use.
4253+
If nothing is specified the default library ``zlib`` is used. A
4254+
compression library usually optimizes for either good compression rates
4255+
or speed and the results will depend on the type of data. Which type of
4256+
compression to choose depends on your specific needs and data. The list
4257+
of supported compression libraries:
4258+
4259+
- `zlib <https://zlib.net/>`_: The default compression library.
4260+
A classic in terms of compression, achieves good compression
4261+
rates but is somewhat slow.
4262+
- `lzo <https://www.oberhumer.com/opensource/lzo/>`_: Fast
4263+
compression and decompression.
4264+
- `bzip2 <http://bzip.org/>`_: Good compression rates.
4265+
- `blosc <http://www.blosc.org/>`_: Fast compression and
4266+
decompression.
4267+
4268+
Support for alternative blosc compressors:
4269+
4270+
- `blosc:blosclz <http://www.blosc.org/>`_: This is the
4271+
default compressor for ``blosc``
4272+
- `blosc:lz4
4273+
<https://fastcompression.blogspot.dk/p/lz4.html>`_:
4274+
A compact, very popular and fast compressor.
4275+
- `blosc:lz4hc
4276+
<https://fastcompression.blogspot.dk/p/lz4.html>`_:
4277+
A tweaked version of LZ4, produces better
4278+
compression ratios at the expense of speed.
4279+
- `blosc:snappy <https://google.github.io/snappy/>`_:
4280+
A popular compressor used in many places.
4281+
- `blosc:zlib <https://zlib.net/>`_: A classic;
4282+
somewhat slower than the previous ones, but
4283+
achieving better compression ratios.
4284+
- `blosc:zstd <https://facebook.github.io/zstd/>`_: An
4285+
extremely well balanced codec; it provides the best
4286+
compression ratios among the others above, and at
4287+
reasonably fast speed.
4288+
4289+
If ``complib`` is defined as something other than the listed libraries a
4290+
``ValueError`` exception is issued.
42634291

42644292
.. note::
42654293

doc/source/whatsnew/v1.0.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ Other enhancements
215215
- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
216216
- Roundtripping DataFrames with nullable integer, string and period data types to parquet
217217
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
218-
now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
218+
now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
219+
- :func:`read_excel` now can read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation <io.xlsb>`. Closes :issue:`8540`.
219220
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
220221
- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
221222
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)

doc/source/whatsnew/v1.1.0.rst

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,21 @@ Enhancements
1818
Other enhancements
1919
^^^^^^^^^^^^^^^^^^
2020

21+
- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
2122
-
2223
-
2324

25+
.. ---------------------------------------------------------------------------
26+
27+
.. _whatsnew_110.api.other:
28+
29+
Other API changes
30+
^^^^^^^^^^^^^^^^^
31+
32+
- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; statistics ``first`` and ``last``
33+
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
34+
-
35+
-
2436

2537
.. ---------------------------------------------------------------------------
2638
@@ -133,14 +145,15 @@ Plotting
133145
Groupby/resample/rolling
134146
^^^^^^^^^^^^^^^^^^^^^^^^
135147

136-
-
137-
-
148+
- Bug in :meth:`GroupBy.apply` raising ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
138149

139150
Reshaping
140151
^^^^^^^^^
141152

142153
-
143154
- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`)
155+
- :meth:`DataFrame.unstack` and :meth:`Series.unstack` can now take tuple names in MultiIndexed data (:issue:`19966`)
156+
- Bug in :meth:`DataFrame.pivot_table` when ``margins`` is ``True`` and only ``columns`` is defined (:issue:`31016`)
144157
- Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`)
145158
- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`)
146159

@@ -160,7 +173,8 @@ ExtensionArray
160173

161174
Other
162175
^^^^^
163-
-
176+
- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True``
177+
instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`)
164178
-
165179

166180
.. ---------------------------------------------------------------------------

doc/sphinxext/announce.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22
# -*- encoding:utf-8 -*-
33
"""
44
Script to generate contributor and pull request lists

pandas/_libs/index.pyx

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ cdef class IndexEngine:
7272
self.over_size_threshold = n >= _SIZE_CUTOFF
7373
self.clear_mapping()
7474

75-
def __contains__(self, object val):
75+
def __contains__(self, val: object) -> bool:
76+
# We assume before we get here:
77+
# - val is hashable
7678
self._ensure_mapping_populated()
77-
hash(val)
7879
return val in self.mapping
7980

8081
cpdef get_value(self, ndarray arr, object key, object tz=None):
@@ -213,7 +214,8 @@ cdef class IndexEngine:
213214
return self.monotonic_dec == 1
214215

215216
cdef inline _do_monotonic_check(self):
216-
cdef object is_unique
217+
cdef:
218+
bint is_unique
217219
try:
218220
values = self._get_index_values()
219221
self.monotonic_inc, self.monotonic_dec, is_unique = \
@@ -236,10 +238,10 @@ cdef class IndexEngine:
236238
cdef _call_monotonic(self, values):
237239
return algos.is_monotonic(values, timelike=False)
238240

239-
def get_backfill_indexer(self, other, limit=None):
241+
def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
240242
return algos.backfill(self._get_index_values(), other, limit=limit)
241243

242-
def get_pad_indexer(self, other, limit=None):
244+
def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
243245
return algos.pad(self._get_index_values(), other, limit=limit)
244246

245247
cdef _make_hash_table(self, Py_ssize_t n):
@@ -414,7 +416,9 @@ cdef class DatetimeEngine(Int64Engine):
414416
raise TypeError(scalar)
415417
return scalar.value
416418

417-
def __contains__(self, object val):
419+
def __contains__(self, val: object) -> bool:
420+
# We assume before we get here:
421+
# - val is hashable
418422
cdef:
419423
int64_t loc, conv
420424

@@ -477,13 +481,13 @@ cdef class DatetimeEngine(Int64Engine):
477481
values = np.asarray(values).view('i8')
478482
return self.mapping.lookup(values)
479483

480-
def get_pad_indexer(self, other, limit=None):
484+
def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
481485
if other.dtype != self._get_box_dtype():
482486
return np.repeat(-1, len(other)).astype('i4')
483487
other = np.asarray(other).view('i8')
484488
return algos.pad(self._get_index_values(), other, limit=limit)
485489

486-
def get_backfill_indexer(self, other, limit=None):
490+
def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
487491
if other.dtype != self._get_box_dtype():
488492
return np.repeat(-1, len(other)).astype('i4')
489493
other = np.asarray(other).view('i8')
@@ -506,16 +510,13 @@ cdef class PeriodEngine(Int64Engine):
506510
cdef _get_index_values(self):
507511
return super(PeriodEngine, self).vgetter().view("i8")
508512

509-
cdef void _call_map_locations(self, values):
510-
# super(...) pattern doesn't seem to work with `cdef`
511-
Int64Engine._call_map_locations(self, values.view('i8'))
512-
513513
cdef _call_monotonic(self, values):
514514
# super(...) pattern doesn't seem to work with `cdef`
515515
return Int64Engine._call_monotonic(self, values.view('i8'))
516516

517517
def get_indexer(self, values):
518-
cdef ndarray[int64_t, ndim=1] ordinals
518+
cdef:
519+
ndarray[int64_t, ndim=1] ordinals
519520

520521
super(PeriodEngine, self)._ensure_mapping_populated()
521522

@@ -524,14 +525,14 @@ cdef class PeriodEngine(Int64Engine):
524525

525526
return self.mapping.lookup(ordinals)
526527

527-
def get_pad_indexer(self, other, limit=None):
528+
def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
528529
freq = super(PeriodEngine, self).vgetter().freq
529530
ordinal = periodlib.extract_ordinals(other, freq)
530531

531532
return algos.pad(self._get_index_values(),
532533
np.asarray(ordinal), limit=limit)
533534

534-
def get_backfill_indexer(self, other, limit=None):
535+
def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray:
535536
freq = super(PeriodEngine, self).vgetter().freq
536537
ordinal = periodlib.extract_ordinals(other, freq)
537538

@@ -714,7 +715,9 @@ cdef class BaseMultiIndexCodesEngine:
714715

715716
return indexer
716717

717-
def __contains__(self, object val):
718+
def __contains__(self, val: object) -> bool:
719+
# We assume before we get here:
720+
# - val is hashable
718721
# Default __contains__ looks in the underlying mapping, which in this
719722
# case only contains integer representations.
720723
try:

pandas/_libs/index_class_helper.pxi.in

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,7 @@ cdef class {{name}}Engine(IndexEngine):
5353
ndarray[{{ctype}}] values
5454
int count = 0
5555

56-
{{if name not in {'Float64', 'Float32'} }}
57-
if not util.is_integer_object(val):
58-
raise KeyError(val)
59-
{{endif}}
56+
self._check_type(val)
6057

6158
# A view is needed for some subclasses, such as PeriodEngine:
6259
values = self._get_index_values().view('{{dtype}}')

0 commit comments

Comments
 (0)