Skip to content

Commit bc65ff4

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 0c90785 + 0ac3d98 commit bc65ff4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+2686
-1761
lines changed

asv_bench/benchmarks/algorithms.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,17 @@ def setup(self):
1818
self.float = pd.Float64Index(np.random.randn(N).repeat(5))
1919

2020
# Convenience naming.
21-
self.checked_add = pd.core.nanops._checked_add_with_arr
21+
self.checked_add = pd.core.algorithms.checked_add_with_arr
2222

2323
self.arr = np.arange(1000000)
2424
self.arrpos = np.arange(1000000)
2525
self.arrneg = np.arange(-1000000, 0)
2626
self.arrmixed = np.array([1, -1]).repeat(500000)
2727
self.strings = tm.makeStringIndex(100000)
2828

29+
self.arr_nan = np.random.choice([True, False], size=1000000)
30+
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
31+
2932
# match
3033
self.uniques = tm.makeStringIndex(1000).values
3134
self.all = self.uniques.repeat(10)
@@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self):
6972
def time_add_overflow_mixed_arr(self):
7073
self.checked_add(self.arr, self.arrmixed)
7174

75+
def time_add_overflow_first_arg_nan(self):
76+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
77+
78+
def time_add_overflow_second_arg_nan(self):
79+
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
80+
81+
def time_add_overflow_both_arg_nan(self):
82+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
83+
b_mask=self.arrmixed_nan)
84+
7285

7386
class Hashing(object):
7487
goal_time = 0.2

asv_bench/benchmarks/io_bench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def setup(self, compression, engine):
153153
# The Python 2 C parser can't read bz2 from open files.
154154
raise NotImplementedError
155155
try:
156-
import boto
156+
import s3fs
157157
except ImportError:
158158
# Skip these benchmarks if `s3fs` is not installed.
159159
raise NotImplementedError

asv_bench/benchmarks/join_merge.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,12 +302,19 @@ def setup(self):
302302
self.df1 = self.df1.sort_values('time')
303303
self.df2 = self.df2.sort_values('time')
304304

305+
self.df1['time32'] = np.int32(self.df1.time)
306+
self.df2['time32'] = np.int32(self.df2.time)
307+
305308
self.df1a = self.df1[['time', 'value1']]
306309
self.df2a = self.df2[['time', 'value2']]
307310
self.df1b = self.df1[['time', 'key', 'value1']]
308311
self.df2b = self.df2[['time', 'key', 'value2']]
309312
self.df1c = self.df1[['time', 'key2', 'value1']]
310313
self.df2c = self.df2[['time', 'key2', 'value2']]
314+
self.df1d = self.df1[['time32', 'value1']]
315+
self.df2d = self.df2[['time32', 'value2']]
316+
self.df1e = self.df1[['time', 'key', 'key2', 'value1']]
317+
self.df2e = self.df2[['time', 'key', 'key2', 'value2']]
311318

312319
def time_noby(self):
313320
merge_asof(self.df1a, self.df2a, on='time')
@@ -318,6 +325,12 @@ def time_by_object(self):
318325
def time_by_int(self):
319326
merge_asof(self.df1c, self.df2c, on='time', by='key2')
320327

328+
def time_on_int32(self):
329+
merge_asof(self.df1d, self.df2d, on='time32')
330+
331+
def time_multiby(self):
332+
merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])
333+
321334

322335
#----------------------------------------------------------------------
323336
# data alignment

asv_bench/benchmarks/series_methods.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,28 @@ def setup(self):
88
self.dr = pd.date_range(
99
start=datetime(2015,10,26),
1010
end=datetime(2016,1,1),
11-
freq='10s'
12-
) # ~500k long
11+
freq='50s'
12+
) # ~100k long
1313

1414
def time_series_constructor_no_data_datetime_index(self):
1515
Series(data=None, index=self.dr)
1616

1717

18+
class series_constructor_dict_data_datetime_index(object):
19+
goal_time = 0.2
20+
21+
def setup(self):
22+
self.dr = pd.date_range(
23+
start=datetime(2015, 10, 26),
24+
end=datetime(2016, 1, 1),
25+
freq='50s'
26+
) # ~100k long
27+
self.data = {d: v for d, v in zip(self.dr, range(len(self.dr)))}
28+
29+
def time_series_constructor_no_data_datetime_index(self):
30+
Series(data=self.data, index=self.dr)
31+
32+
1833
class series_isin_int64(object):
1934
goal_time = 0.2
2035

ci/requirements-2.7-64.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ sqlalchemy
1111
lxml=3.2.1
1212
scipy
1313
xlsxwriter
14-
boto
14+
s3fs
1515
bottleneck
1616
html5lib
1717
beautiful-soup

ci/requirements-2.7.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ sqlalchemy=0.9.6
1111
lxml=3.2.1
1212
scipy
1313
xlsxwriter=0.4.6
14-
boto=2.36.0
14+
s3fs
1515
bottleneck
1616
psycopg2=2.5.2
1717
patsy

ci/requirements-2.7_SLOW.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ numexpr
1313
pytables
1414
sqlalchemy
1515
lxml
16-
boto
16+
s3fs
1717
bottleneck
1818
psycopg2
1919
pymysql

ci/requirements-3.5.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ sqlalchemy
1717
pymysql
1818
psycopg2
1919
xarray
20-
boto
20+
s3fs
2121

2222
# incompat with conda ATM
2323
# beautiful-soup

ci/requirements-3.5_OSX.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ matplotlib
1212
jinja2
1313
bottleneck
1414
xarray
15-
boto
15+
s3fs
1616

1717
# incompat with conda ATM
1818
# beautiful-soup

doc/source/install.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ Optional Dependencies
262262
* `XlsxWriter <https://pypi.python.org/pypi/XlsxWriter>`__: Alternative Excel writer
263263

264264
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
265-
* `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3 access.
265+
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
266266
* `blosc <https://pypi.python.org/pypi/blosc>`__: for msgpack compression using ``blosc``
267267
* One of `PyQt4
268268
<http://www.riverbankcomputing.com/software/pyqt/download>`__, `PySide

doc/source/io.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,23 @@ options include:
14871487
Specifying any of the above options will produce a ``ParserWarning`` unless the
14881488
python engine is selected explicitly using ``engine='python'``.
14891489

1490+
Reading remote files
1491+
''''''''''''''''''''
1492+
1493+
You can pass in a URL to a CSV file:
1494+
1495+
.. code-block:: python
1496+
1497+
df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
1498+
sep='\t')
1499+
1500+
S3 URLs are handled as well:
1501+
1502+
.. code-block:: python
1503+
1504+
df = pd.read_csv('s3://pandas-test/tips.csv')
1505+
1506+
14901507
Writing out Data
14911508
''''''''''''''''
14921509

doc/source/whatsnew/v0.19.2.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@ Performance Improvements
2222
~~~~~~~~~~~~~~~~~~~~~~~~
2323

2424
- Improved performance of ``.replace()`` (:issue:`12745`)
25+
- Improved performance of ``Series`` creation with a datetime index and dictionary data (:issue:`14894`)
2526

2627
.. _whatsnew_0192.enhancements.other:
2728

2829
Other Enhancements
2930
~~~~~~~~~~~~~~~~~~
3031

3132
- ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`)
33+
- ``pd.merge_asof()`` can take multiple columns in ``by`` parameter and has specialized dtypes for better performance (:issue:`13936`)
3234

3335

3436

@@ -39,10 +41,13 @@ Bug Fixes
3941

4042
- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
4143
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
44+
- Bug in ``pd.read_csv`` in which aliasing was being done for ``na_values`` when passed in as a dictionary (:issue:`14203`)
45+
- Bug in ``pd.read_csv`` in which column indices for a dict-like ``na_values`` were not being respected (:issue:`14203`)
4246
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
4347
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
4448
- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.
4549
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`)
50+
- Bug in ``.fillna()`` in which timezone aware datetime64 values were incorrectly rounded (:issue:`14872`)
4651

4752

4853
- Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`)

doc/source/whatsnew/v0.20.0.txt

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere
6464

6565
df.groupby(['second', 'A']).sum()
6666

67+
.. _whatsnew_0200.enhancements.compressed_urls:
68+
69+
Better support for compressed URLs in ``read_csv``
70+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
71+
72+
The compression code was refactored (:issue:`12688`). As a result, reading
73+
dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports
74+
additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`).
75+
Previously, only ``gzip`` compression was supported. By default, compression of
76+
URLs and paths are now both inferred using their file extensions. Additionally,
77+
support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
78+
79+
.. ipython:: python
80+
url = 'https://github.com/{repo}/raw/{branch}/{path}'.format(
81+
repo = 'pandas-dev/pandas',
82+
branch = 'master',
83+
path = 'pandas/io/tests/parser/data/salaries.csv.bz2',
84+
)
85+
df = pd.read_table(url, compression='infer') # default, infer compression
86+
df = pd.read_table(url, compression='bz2') # explicitly specify compression
87+
df.head(2)
6788

6889
.. _whatsnew_0200.enhancements.other:
6990

@@ -85,14 +106,92 @@ Other enhancements
85106
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
86107
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
87108

109+
- ``.select_dtypes()`` now allows `datetimetz` to generically select datetimes with tz (:issue:`14910`)
110+
88111

89112
.. _whatsnew_0200.api_breaking:
90113

91114
Backwards incompatible API changes
92115
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
93116

94-
.. _whatsnew_0200.api:
117+
.. _whatsnew.api_breaking.index_map
118+
119+
Map on Index types now return other Index types
120+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
121+
122+
- ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`)
123+
124+
.. ipython:: python
125+
126+
idx = Index([1, 2])
127+
idx
128+
mi = MultiIndex.from_tuples([(1, 2), (2, 4)])
129+
mi
130+
131+
Previous Behavior:
132+
133+
.. code-block:: ipython
134+
135+
In [5]: idx.map(lambda x: x * 2)
136+
Out[5]: array([2, 4])
137+
138+
In [6]: idx.map(lambda x: (x, x * 2))
139+
Out[6]: array([(1, 2), (2, 4)], dtype=object)
140+
141+
In [7]: mi.map(lambda x: x)
142+
Out[7]: array([(1, 2), (2, 4)], dtype=object)
143+
144+
In [8]: mi.map(lambda x: x[0])
145+
Out[8]: array([1, 2])
146+
147+
New Behavior:
148+
149+
.. ipython:: python
150+
151+
idx.map(lambda x: x * 2)
152+
153+
idx.map(lambda x: (x, x * 2))
95154

155+
mi.map(lambda x: x)
156+
157+
mi.map(lambda x: x[0])
158+
159+
160+
- ``map`` on a Series with datetime64 values may return int64 dtypes rather than int32
161+
162+
.. ipython:: python
163+
164+
s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo'))
165+
s
166+
167+
Previous Behavior:
168+
169+
.. code-block:: ipython
170+
171+
In [9]: s.map(lambda x: x.hour)
172+
Out[9]:
173+
0 0
174+
1 1
175+
2 2
176+
dtype: int32
177+
178+
179+
New Behavior:
180+
181+
.. ipython:: python
182+
183+
s.map(lambda x: x.hour)
184+
185+
.. _whatsnew_0200.s3:
186+
187+
S3 File Handling
188+
^^^^^^^^^^^^^^^^
189+
190+
pandas now uses `s3fs <http://s3fs.readthedocs.io/>`_ for handling S3 connections. This shouldn't break
191+
any code. However, since s3fs is not a required dependency, you will need to install it separately (like boto
192+
in prior versions of pandas) (:issue:`11915`).
193+
194+
.. _whatsnew_0200.api:
96195

97196
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
98197
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
@@ -103,7 +202,6 @@ Backwards incompatible API changes
103202
Other API Changes
104203
^^^^^^^^^^^^^^^^^
105204

106-
107205
.. _whatsnew_0200.deprecations:
108206

109207
Deprecations
@@ -144,6 +242,8 @@ Performance Improvements
144242
Bug Fixes
145243
~~~~~~~~~
146244

245+
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
246+
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
147247
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
148248

149249

@@ -158,5 +258,4 @@ Bug Fixes
158258

159259

160260

161-
162261
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)

0 commit comments

Comments
 (0)