From 93a67dbbb6f8adad0061eaa21a046ba44e297957 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 29 May 2019 11:17:24 +0100 Subject: [PATCH 01/51] CI: unary operator expected error in log files (#26547) --- ci/setup_env.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index e2667558a63d7..8f73bb228e2bd 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -118,12 +118,12 @@ echo "conda list" conda list # Install DB for Linux -if [ ${TRAVIS_OS_NAME} == "linux" ]; then +if [ "${TRAVIS_OS_NAME}" == "linux" ]; then echo "installing dbs" mysql -e 'create database pandas_nosetest;' psql -c 'create database pandas_nosetest;' -U postgres else - echo "not using dbs on non-linux" + echo "not using dbs on non-linux Travis builds or Azure Pipelines" fi echo "done" From a91da0c94e541217865cdf52b9f6ea694f0493d3 Mon Sep 17 00:00:00 2001 From: Vaibhav Vishal Date: Wed, 29 May 2019 18:10:39 +0530 Subject: [PATCH 02/51] Fix type annotations in pandas.core.indexes.period (#26518) --- mypy.ini | 3 --- pandas/core/indexes/period.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/mypy.ini b/mypy.ini index 3df8fd13a2a75..eea6a3b551677 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,6 +7,3 @@ ignore_errors=True [mypy-pandas.core.indexes.datetimelike] ignore_errors=True - -[mypy-pandas.core.indexes.period] -ignore_errors=True diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 044951ceda502..64272431cf703 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -80,7 +80,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): Parameters ---------- - data : array-like (1-dimensional), optional + data : array-like (1d integer np.ndarray or PeriodArray), optional Optional period-like data to construct index with copy : bool Make a copy of input ndarray @@ -168,7 +168,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): _is_numeric_dtype = False _infer_as_myclass = True - _data = None # type: PeriodArray + _data = None _engine_type = libindex.PeriodEngine From 5488636266bcd78282c66d551b452ab38da17bd0 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 29 May 2019 18:02:00 -0700 Subject: [PATCH 03/51] Fixturize Test Excel (#26543) --- pandas/tests/io/test_excel.py | 614 +++++++++++++++------------------- 1 file changed, 272 insertions(+), 342 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 100de227aa97c..6db3d1d4ab34d 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -22,7 +22,7 @@ from pandas.io.common import URLError from pandas.io.excel import ( ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, - read_excel, register_writer) + register_writer) from pandas.io.formats.excel import ExcelFormatter from pandas.io.parsers import read_csv @@ -53,7 +53,6 @@ class SharedItems: @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -65,135 +64,85 @@ class ReadingTestsBase(SharedItems): # This is based on ExcelWriterBase @pytest.fixture(autouse=True, params=['xlrd', None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - - def 
get_csv_refdf(self, basename): + def cd_and_set_engine(self, request, datapath, monkeypatch): """ - Obtain the reference data from read_csv with the Python engine. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - dfref : DataFrame + Change directory and set engine for read_excel calls. """ - pref = os.path.join(self.dirpath, basename + '.csv') - dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') - return dfref + func = partial(pd.read_excel, engine=request.param) + monkeypatch.chdir(datapath("io", "data")) + monkeypatch.setattr(pd, 'read_excel', func) - def get_excelfile(self, basename, ext): + @pytest.fixture + def df_ref(self): """ - Return test data ExcelFile instance. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - excel : io.excel.ExcelFile - """ - return ExcelFile(os.path.join(self.dirpath, basename + ext)) - - def get_exceldf(self, basename, ext, *args, **kwds): - """ - Return test data DataFrame. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - df : DataFrame + Obtain the reference data from read_csv with the Python engine. """ - pth = os.path.join(self.dirpath, basename + ext) - return read_excel(pth, *args, **kwds) + df_ref = read_csv('test1.csv', index_col=0, + parse_dates=True, engine='python') + return df_ref @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_usecols_int(self, ext): - - df_ref = self.get_csv_refdf("test1") + def test_usecols_int(self, ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with ignore_xlrd_time_clock_warning(): - df1 = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=3) + df1 = pd.read_excel("test1" + ext, "Sheet1", + index_col=0, usecols=3) # usecols as int with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, usecols=3) + df2 = pd.read_excel("test1" + ext, "Sheet2", skiprows=[1], + index_col=0, usecols=3) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_list(self, ext): + def test_usecols_list(self, ext, df_ref): - dfref = self.get_csv_refdf('test1') - dfref = dfref.reindex(columns=['B', 'C']) - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) + df_ref = df_ref.reindex(columns=['B', 'C']) + df1 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + usecols=[0, 2, 3]) + df2 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + index_col=0, usecols=[0, 2, 3]) # TODO add index to xls file) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_str(self, ext): + def test_usecols_str(self, ext, df_ref): - dfref = self.get_csv_refdf('test1') - - df1 = dfref.reindex(columns=['A', 'B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A:D') - df3 = 
self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') + df1 = df_ref.reindex(columns=['A', 'B', 'C']) + df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + usecols='A:D') + df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A:D') # TODO add index to xls, read xls ignores index name ? tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') + df1 = df_ref.reindex(columns=['B', 'C']) + df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + usecols='A,C,D') + df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') + df1 = df_ref.reindex(columns=['B', 'C']) + df2 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, + usecols='A,C:D') + df3 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) @@ -202,50 +151,52 @@ def test_usecols_str(self, ext): [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0], ]) - def test_usecols_diff_positional_int_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["A", "C"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=usecols) + def test_usecols_diff_positional_int_columns_order( + self, ext, usecols, df_ref): + expected = df_ref[["A", "C"]] + result = pd.read_excel("test1" + ext, "Sheet1", + index_col=0, usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.parametrize("usecols", [ ["B", "D"], ["D", "B"] ]) - def test_usecols_diff_positional_str_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["B", "D"]] + def test_usecols_diff_positional_str_columns_order( + self, ext, usecols, df_ref): + expected = df_ref[["B", "D"]] expected.index = range(len(expected)) - result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) + result = pd.read_excel("test1" + ext, "Sheet1", usecols=usecols) tm.assert_frame_equal(result, expected, check_names=False) - def test_read_excel_without_slicing(self, ext): - expected = self.get_csv_refdf("test1") - result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) + def test_read_excel_without_slicing(self, ext, df_ref): + expected = df_ref + result = pd.read_excel("test1" + ext, "Sheet1", index_col=0) tm.assert_frame_equal(result, expected, check_names=False) - def test_usecols_excel_range_str(self, ext): - expected = self.get_csv_refdf("test1")[["C", "D"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols="A,D:E") + def test_usecols_excel_range_str(self, ext, df_ref): + expected = df_ref[["C", "D"]] + result = pd.read_excel("test1" + ext, "Sheet1", + index_col=0, usecols="A,D:E") tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str_invalid(self, ext): msg = "Invalid column name: E1" with pytest.raises(ValueError, 
match=msg): - self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") + pd.read_excel("test1" + ext, "Sheet1", usecols="D:E1") def test_index_col_label_error(self, ext): msg = "list indices must be integers.*, not str" with pytest.raises(TypeError, match=msg): - self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], - usecols=["A", "C"]) + pd.read_excel("test1" + ext, "Sheet1", index_col=["A"], + usecols=["A", "C"]) def test_index_col_empty(self, ext): # see gh-9208 - result = self.get_exceldf("test1", ext, "Sheet3", - index_col=["A", "B", "C"]) + result = pd.read_excel("test1" + ext, "Sheet3", + index_col=["A", "B", "C"]) expected = DataFrame(columns=["D", "E", "F"], index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, @@ -255,8 +206,7 @@ def test_index_col_empty(self, ext): @pytest.mark.parametrize("index_col", [None, 2]) def test_index_col_with_unnamed(self, ext, index_col): # see gh-18792 - result = self.get_exceldf("test1", ext, "Sheet4", - index_col=index_col) + result = pd.read_excel("test1" + ext, "Sheet4", index_col=index_col) expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]) if index_col: @@ -269,54 +219,54 @@ def test_usecols_pass_non_existent_column(self, ext): "columns expected but not found: " + r"\['E'\]") with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E"]) + pd.read_excel("test1" + ext, usecols=["E"]) def test_usecols_wrong_type(self, ext): msg = ("'usecols' must either be list-like of " "all strings, all unicode, all integers or a callable.") with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E1", 0]) + pd.read_excel("test1" + ext, usecols=["E1", 0]) def test_excel_stop_iterator(self, ext): - parsed = self.get_exceldf('test2', ext, 'Sheet1') + parsed = pd.read_excel('test2' + ext, 'Sheet1') expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, ext): - parsed = self.get_exceldf('test3', ext, 'Sheet1') + parsed = pd.read_excel('test3' + ext, 'Sheet1') expected = DataFrame([[np.nan]], columns=['Test']) tm.assert_frame_equal(parsed, expected) def test_excel_passes_na(self, ext): - excel = self.get_excelfile('test4', ext) + excel = ExcelFile('test4' + ext) - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) # 13967 - excel = self.get_excelfile('test5', ext) + excel = ExcelFile('test5' + ext) - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) + parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], 
columns=['Test']) tm.assert_frame_equal(parsed, expected) @@ -325,34 +275,33 @@ def test_excel_passes_na(self, ext): @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) def test_unexpected_kwargs_raises(self, ext, arg): # gh-17964 - excel = self.get_excelfile('test1', ext) + excel = ExcelFile('test1' + ext) kwarg = {arg: 'Sheet1'} msg = "unexpected keyword argument `{}`".format(arg) with pytest.raises(TypeError, match=msg): - read_excel(excel, **kwarg) + pd.read_excel(excel, **kwarg) @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_excel_table_sheet_by_index(self, ext): + def test_excel_table_sheet_by_index(self, ext, df_ref): - excel = self.get_excelfile('test1', ext) - dfref = self.get_csv_refdf('test1') + excel = ExcelFile('test1' + ext) - df1 = read_excel(excel, 0, index_col=0) - df2 = read_excel(excel, 1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) + df1 = pd.read_excel(excel, 0, index_col=0) + df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = read_excel(excel, 0, index_col=0, skipfooter=1) + df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df4 = read_excel(excel, 0, index_col=0, skip_footer=1) + df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) tm.assert_frame_equal(df3, df4) df3 = excel.parse(0, index_col=0, skipfooter=1) @@ -360,21 +309,18 @@ def test_excel_table_sheet_by_index(self, ext): import xlrd with pytest.raises(xlrd.XLRDError): - read_excel(excel, 'asdf') - - def test_excel_table(self, ext): + pd.read_excel(excel, 'asdf') - dfref = self.get_csv_refdf('test1') + def test_excel_table(self, ext, df_ref): - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0) + df1 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0) + df2 = pd.read_excel('test1' + ext, 'Sheet2', skiprows=[1], + index_col=0) # TODO add index to file - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - skipfooter=1) + df3 = pd.read_excel('test1' + ext, 'Sheet1', index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, ext): @@ -393,32 +339,32 @@ def test_reader_special_dtypes(self, ext): basename = 'test_types' # should read in correctly and infer types - actual = self.get_exceldf(basename, ext, 'Sheet1') + actual = pd.read_excel(basename + ext, 'Sheet1') tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) + actual 
= pd.read_excel(basename + ext, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) + actual = pd.read_excel(basename + ext, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = self.get_exceldf( - basename, ext, 'Sheet1', converters={"StrCol": str}) + actual = pd.read_excel(basename + ext, 'Sheet1', + converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, - converters={"StrCol": str}) + actual = pd.read_excel(basename + ext, 'Sheet1', convert_float=False, + converters={"StrCol": str}) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values @@ -441,14 +387,13 @@ def test_reader_converters(self, ext): # should read in correctly and set types of single cells (not array # dtypes) - actual = self.get_exceldf(basename, ext, 'Sheet1', - converters=converters) + actual = pd.read_excel(basename + ext, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) def test_reader_dtype(self, ext): # GH 8212 basename = 'testdtype' - actual = self.get_exceldf(basename, ext) + actual = pd.read_excel(basename + ext) expected = DataFrame({ 'a': [1, 2, 3, 4], @@ -459,10 +404,10 @@ def test_reader_dtype(self, ext): tm.assert_frame_equal(actual, expected) - actual = self.get_exceldf(basename, ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str}) + actual = pd.read_excel(basename + ext, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str}) expected['a'] = expected['a'].astype('float64') expected['b'] = expected['b'].astype('float32') @@ -470,7 +415,7 @@ def test_reader_dtype(self, ext): tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + pd.read_excel(basename + ext, dtype={'d': 'int64'}) @pytest.mark.parametrize("dtype,expected", [ (None, @@ -496,7 +441,7 @@ def test_reader_dtype_str(self, ext, dtype, expected): # see gh-20377 basename = "testdtype" - actual = self.get_exceldf(basename, ext, dtype=dtype) + actual = pd.read_excel(basename + ext, dtype=dtype) tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self, ext): @@ -504,7 +449,7 @@ def test_reading_all_sheets(self, ext): # Ensure a dict is returned. # See PR #9450 basename = 'test_multisheet' - dfs = self.get_exceldf(basename, ext, sheet_name=None) + dfs = pd.read_excel(basename + ext, sheet_name=None) # ensure this is not alphabetical to test order preservation expected_keys = ['Charlie', 'Alpha', 'Beta'] tm.assert_contains_all(expected_keys, dfs.keys()) @@ -521,7 +466,7 @@ def test_reading_multiple_specific_sheets(self, ext): basename = 'test_multisheet' # Explicitly request duplicates. Only the set should be returned. 
expected_keys = [2, 'Charlie', 'Charlie'] - dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) + dfs = pd.read_excel(basename + ext, sheet_name=expected_keys) expected_keys = list(set(expected_keys)) tm.assert_contains_all(expected_keys, dfs.keys()) assert len(expected_keys) == len(dfs.keys()) @@ -531,18 +476,18 @@ def test_reading_all_sheets_with_blank(self, ext): # In the case where some sheets are blank. # Issue #11711 basename = 'blank_with_header' - dfs = self.get_exceldf(basename, ext, sheet_name=None) + dfs = pd.read_excel(basename + ext, sheet_name=None) expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] tm.assert_contains_all(expected_keys, dfs.keys()) # GH6403 def test_read_excel_blank(self, ext): - actual = self.get_exceldf('blank', ext, 'Sheet1') + actual = pd.read_excel('blank' + ext, 'Sheet1') tm.assert_frame_equal(actual, DataFrame()) def test_read_excel_blank_with_header(self, ext): expected = DataFrame(columns=['col_1', 'col_2']) - actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') + actual = pd.read_excel('blank_with_header' + ext, 'Sheet1') tm.assert_frame_equal(actual, expected) def test_date_conversion_overflow(self, ext): @@ -552,11 +497,11 @@ def test_date_conversion_overflow(self, ext): [1e+20, 'Timothy Brown']], columns=['DateColWithBigInt', 'StringCol']) - result = self.get_exceldf('testdateoverflow', ext) + result = pd.read_excel('testdateoverflow' + ext) tm.assert_frame_equal(result, expected) @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_sheet_name_and_sheetname(self, ext): + def test_sheet_name_and_sheetname(self, ext, df_ref): # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name @@ -565,14 +510,13 @@ def test_sheet_name_and_sheetname(self, ext): filename = "test1" sheet_name = "Sheet1" - df_ref = self.get_csv_refdf(filename) - df1 = self.get_exceldf(filename, ext, - sheet_name=sheet_name, index_col=0) # doc + df1 = pd.read_excel(filename + ext, + sheet_name=sheet_name, index_col=0) # doc with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf(filename, ext, index_col=0, - sheet_name=sheet_name) + df2 = pd.read_excel(filename + ext, index_col=0, + sheet_name=sheet_name) - excel = self.get_excelfile(filename, ext) + excel = ExcelFile(filename + ext) df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) @@ -584,55 +528,53 @@ def test_sheet_name_and_sheetname(self, ext): def test_excel_read_buffer(self, ext): - pth = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(pth, 'Sheet1', index_col=0) + pth = 'test1' + ext + expected = pd.read_excel(pth, 'Sheet1', index_col=0) with open(pth, 'rb') as f: - actual = read_excel(f, 'Sheet1', index_col=0) + actual = pd.read_excel(f, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) with open(pth, 'rb') as f: xls = ExcelFile(f) - actual = read_excel(xls, 'Sheet1', index_col=0) + actual = pd.read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) def test_bad_engine_raises(self, ext): bad_engine = 'foo' with pytest.raises(ValueError, match="Unknown engine: foo"): - read_excel('', engine=bad_engine) + pd.read_excel('', engine=bad_engine) @tm.network def test_read_from_http_url(self, ext): url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/data/test1' + ext) - url_table = read_excel(url) - local_table = 
self.get_exceldf('test1', ext) + url_table = pd.read_excel(url) + local_table = pd.read_excel('test1' + ext) tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py - file_name = os.path.join(self.dirpath, 'test1' + ext) - - with open(file_name, "rb") as f: + with open('test1' + ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, Body=f) url = ('s3://pandas-test/test1' + ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1', ext) + url_table = pd.read_excel(url) + local_table = pd.read_excel('test1' + ext) tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow # ignore warning from old xlrd @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") - def test_read_from_file_url(self, ext): + def test_read_from_file_url(self, ext, datapath): # FILE - localtable = os.path.join(self.dirpath, 'test1' + ext) - local_table = read_excel(localtable) + localtable = os.path.join(datapath("io", "data"), 'test1' + ext) + local_table = pd.read_excel(localtable) try: - url_table = read_excel('file://localhost/' + localtable) + url_table = pd.read_excel('file://localhost/' + localtable) except URLError: # fails on some systems import platform @@ -646,11 +588,11 @@ def test_read_from_pathlib_path(self, ext): # GH12655 from pathlib import Path - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) + str_path = 'test1' + ext + expected = pd.read_excel(str_path, 'Sheet1', index_col=0) - path_obj = Path(self.dirpath, 'test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = Path('test1' + ext) + actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) @@ -660,22 +602,20 @@ def test_read_from_py_localpath(self, ext): # GH12655 from py.path import local as LocalPath - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) + str_path = os.path.join('test1' + ext) + expected = pd.read_excel(str_path, 'Sheet1', index_col=0) - abs_dir = os.path.abspath(self.dirpath) - path_obj = LocalPath(abs_dir).join('test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = LocalPath().join('test1' + ext) + actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, ext): - pth = os.path.join(self.dirpath, 'test1' + ext) - f = open(pth, 'rb') + f = open('test1' + ext, 'rb') with ExcelFile(f) as xlsx: # parses okay - read_excel(xlsx, 'Sheet1', index_col=0) + pd.read_excel(xlsx, 'Sheet1', index_col=0) assert f.closed @@ -694,16 +634,16 @@ def test_reader_seconds(self, ext): time(16, 37, 0, 900000), time(18, 20, 54)]}) - actual = self.get_exceldf('times_1900', ext, 'Sheet1') + actual = pd.read_excel('times_1900' + ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - actual = self.get_exceldf('times_1904', ext, 'Sheet1') + actual = pd.read_excel('times_1904' + ext, 'Sheet1') tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, ext): # see gh-4679 mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) - mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + mi_file = "testmultiindex" + ext # "mi_column" sheet expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -712,34 +652,37 @@ def test_read_excel_multiindex(self, 
ext): [4, 5.5, pd.Timestamp("2015-01-04"), True]], columns=mi) - actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) + actual = pd.read_excel( + mi_file, "mi_column", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # "mi_index" sheet expected.index = mi expected.columns = ["a", "b", "c", "d"] - actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) + actual = pd.read_excel(mi_file, "mi_index", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) # "both" sheet expected.columns = mi - actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel( + mi_file, "both", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) + actual = pd.read_excel( + mi_file, "mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet expected.index = list(range(4)) expected.columns = mi.set_names(["c1", "c2"]) - actual = read_excel(mi_file, "mi_column_name", - header=[0, 1], index_col=0) + actual = pd.read_excel(mi_file, "mi_column_name", + header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # see gh-11317 @@ -747,29 +690,29 @@ def test_read_excel_multiindex(self, ext): expected.columns = mi.set_levels( [1, 2], level=1).set_names(["c1", "c2"]) - actual = read_excel(mi_file, "name_with_int", - index_col=0, header=[0, 1]) + actual = pd.read_excel(mi_file, "name_with_int", + index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_name" sheet expected.columns = mi.set_names(["c1", "c2"]) expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = read_excel(mi_file, "both_name", - index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel(mi_file, "both_name", + index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_skiprows" sheet - actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], - header=[0, 1], skiprows=2) + actual = pd.read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], + header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex_header_only(self, ext): # see gh-11733. # # Don't try to parse a header name if there isn't one. 
- mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) - result = read_excel(mi_file, "index_col_none", header=[0, 1]) + mi_file = "testmultiindex" + ext + result = pd.read_excel(mi_file, "index_col_none", header=[0, 1]) exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) @@ -778,7 +721,6 @@ def test_read_excel_multiindex_header_only(self, ext): def test_excel_old_index_format(self, ext): # see gh-4679 filename = "test_index_name_pre17" + ext - in_file = os.path.join(self.dirpath, filename) # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will @@ -801,12 +743,12 @@ def test_excel_old_index_format(self, ext): expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, "single_names", index_col=0) + actual = pd.read_excel(filename, "single_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) + actual = pd.read_excel(filename, "multi_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # The analogous versions of the "names" version data @@ -828,31 +770,28 @@ def test_excel_old_index_format(self, ext): expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, "single_no_names", index_col=0) + actual = pd.read_excel(filename, "single_no_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) + actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) def test_read_excel_bool_header_arg(self, ext): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - header=arg) + pd.read_excel('test1' + ext, header=arg) def test_read_excel_chunksize(self, ext): # GH 8011 with pytest.raises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - chunksize=100) + pd.read_excel('test1' + ext, chunksize=100) def test_read_excel_skiprows_list(self, ext): # GH 4903 - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), + actual = pd.read_excel('testskiprows' + ext, 'skiprows_list', skiprows=[0, 2]) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], @@ -861,41 +800,35 @@ def test_read_excel_skiprows_list(self, ext): columns=['a', 'b', 'c', 'd']) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), + actual = pd.read_excel('testskiprows' + ext, 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows(self, ext): # GH 16645 num_rows_to_pull = 5 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) + actual = pd.read_excel('test1' + ext, nrows=num_rows_to_pull) + expected = pd.read_excel('test1' + ext) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): # GH 16645 - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) + expected = pd.read_excel('test1' + ext) num_records_in_file = len(expected) num_rows_to_pull = 
num_records_in_file + 10 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) + actual = pd.read_excel('test1' + ext, nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_non_integer_parameter(self, ext): # GH 16645 msg = "'nrows' must be an integer >=0" with pytest.raises(ValueError, match=msg): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows='5') + pd.read_excel('test1' + ext, nrows='5') def test_read_excel_squeeze(self, ext): # GH 12157 - f = os.path.join(self.dirpath, 'test_squeeze' + ext) + f = 'test_squeeze' + ext actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') @@ -934,7 +867,7 @@ def test_read_one_empty_col_no_header(self, ext, header, expected): with ensure_clean(ext) as path: df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) + result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) @@ -955,7 +888,7 @@ def test_read_one_empty_col_with_header(self, ext, header, expected): with ensure_clean(ext) as path: df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) + result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) @@ -976,10 +909,10 @@ def test_set_column_names_in_parameter(self, ext): refdf.columns = ['A', 'B'] with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) + xlsdf_no_head = pd.read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = pd.read_excel( + reader, 'Data_with_head', index_col=None, names=['A', 'B']) tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) @@ -1005,7 +938,7 @@ def tdf(col_sheet_name): for sheetname, df in dfs.items(): df.to_excel(ew, sheetname) - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) @@ -1101,15 +1034,15 @@ def test_read_excel_parse_dates(self, ext): with ensure_clean(ext) as pth: df2.to_excel(pth) - res = read_excel(pth, index_col=0) + res = pd.read_excel(pth, index_col=0) tm.assert_frame_equal(df2, res) - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) + res = pd.read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) tm.assert_frame_equal(df, res) @@ -1134,11 +1067,11 @@ def test_read_xlrd_book(self, ext): book = xlrd.open_workbook(pth) with ExcelFile(book, engine=engine) as xl: - result = read_excel(xl, sheet_name, index_col=0) + result = pd.read_excel(xl, sheet_name, index_col=0) tm.assert_frame_equal(df, result) - result = read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) + result = pd.read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) tm.assert_frame_equal(df, result) @@ -1192,12 +1125,12 @@ def test_excel_sheet_by_name_raise(self, 
*_): gt.to_excel(self.path) xl = ExcelFile(self.path) - df = read_excel(xl, 0, index_col=0) + df = pd.read_excel(xl, 0, index_col=0) tm.assert_frame_equal(gt, df) with pytest.raises(xlrd.XLRDError): - read_excel(xl, "0") + pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, *_): with ExcelWriter(self.path) as writer: @@ -1205,8 +1138,8 @@ def test_excel_writer_context_manager(self, *_): self.frame2.to_excel(writer, "Data2") with ExcelFile(self.path) as reader: - found_df = read_excel(reader, "Data1", index_col=0) - found_df2 = read_excel(reader, "Data2", index_col=0) + found_df = pd.read_excel(reader, "Data1", index_col=0) + found_df2 = pd.read_excel(reader, "Data2", index_col=0) tm.assert_frame_equal(found_df, self.frame) tm.assert_frame_equal(found_df2, self.frame2) @@ -1221,47 +1154,49 @@ def test_roundtrip(self, merge_cells, engine, ext): # test roundtrip self.frame.to_excel(self.path, 'test1') - recons = read_excel(self.path, 'test1', index_col=0) + recons = pd.read_excel(self.path, 'test1', index_col=0) tm.assert_frame_equal(self.frame, recons) self.frame.to_excel(self.path, 'test1', index=False) - recons = read_excel(self.path, 'test1', index_col=None) + recons = pd.read_excel(self.path, 'test1', index_col=None) recons.index = self.frame.index tm.assert_frame_equal(self.frame, recons) self.frame.to_excel(self.path, 'test1', na_rep='NA') - recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA']) + recons = pd.read_excel( + self.path, 'test1', index_col=0, na_values=['NA']) tm.assert_frame_equal(self.frame, recons) # GH 3611 self.frame.to_excel(self.path, 'test1', na_rep='88') - recons = read_excel(self.path, 'test1', index_col=0, na_values=['88']) + recons = pd.read_excel( + self.path, 'test1', index_col=0, na_values=['88']) tm.assert_frame_equal(self.frame, recons) self.frame.to_excel(self.path, 'test1', na_rep='88') - recons = read_excel(self.path, 'test1', index_col=0, - na_values=[88, 88.0]) + recons = pd.read_excel( + self.path, 'test1', index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(self.frame, recons) # GH 6573 self.frame.to_excel(self.path, 'Sheet1') - recons = read_excel(self.path, index_col=0) + recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(self.frame, recons) self.frame.to_excel(self.path, '0') - recons = read_excel(self.path, index_col=0) + recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(self.frame, recons) # GH 8825 Pandas Series should provide to_excel method s = self.frame["A"] s.to_excel(self.path) - recons = read_excel(self.path, index_col=0) + recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) def test_mixed(self, merge_cells, engine, ext): self.mixed_frame.to_excel(self.path, 'test1') reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, 'test1', index_col=0) tm.assert_frame_equal(self.mixed_frame, recons) def test_ts_frame(self, *_): @@ -1270,7 +1205,7 @@ def test_ts_frame(self, *_): df.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) def test_basics_with_nan(self, merge_cells, engine, ext): @@ -1290,18 +1225,18 @@ def test_int_types(self, merge_cells, engine, ext, np_type): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", 
index_col=0) int_frame = frame.astype(np.int64) tm.assert_frame_equal(int_frame, recons) - recons2 = read_excel(self.path, "test1", index_col=0) + recons2 = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. float_frame = frame.astype(float) - recons = read_excel(self.path, "test1", - convert_float=False, index_col=0) + recons = pd.read_excel(self.path, "test1", + convert_float=False, index_col=0) tm.assert_frame_equal(recons, float_frame, check_index_type=False, check_column_type=False) @@ -1314,7 +1249,7 @@ def test_float_types(self, merge_cells, engine, ext, np_type): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0).astype(np_type) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(frame, recons, check_dtype=False) @@ -1325,7 +1260,7 @@ def test_bool_types(self, merge_cells, engine, ext, np_type): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0).astype(np_type) + recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(frame, recons) @@ -1334,7 +1269,7 @@ def test_inf_roundtrip(self, *_): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(frame, recons) @@ -1352,9 +1287,9 @@ def test_sheets(self, merge_cells, engine, ext): self.tsframe.to_excel(writer, 'test2') writer.save() reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, 'test1', index_col=0) tm.assert_frame_equal(self.frame, recons) - recons = read_excel(reader, 'test2', index_col=0) + recons = pd.read_excel(reader, 'test2', index_col=0) tm.assert_frame_equal(self.tsframe, recons) assert 2 == len(reader.sheet_names) assert 'test1' == reader.sheet_names[0] @@ -1372,7 +1307,7 @@ def test_colaliases(self, merge_cells, engine, ext): col_aliases = Index(['AA', 'X', 'Y', 'Z']) self.frame2.to_excel(self.path, 'test1', header=col_aliases) reader = ExcelFile(self.path) - rs = read_excel(reader, 'test1', index_col=0) + rs = pd.read_excel(reader, 'test1', index_col=0) xp = self.frame2.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) @@ -1391,9 +1326,8 @@ def test_roundtrip_indexlabels(self, merge_cells, engine, ext): index_label=['test'], merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + recons = pd.read_excel( + reader, 'test1', index_col=0).astype(np.int64) frame.index.names = ['test'] assert frame.index.names == recons.index.names @@ -1403,9 +1337,8 @@ def test_roundtrip_indexlabels(self, merge_cells, engine, ext): index_label=['test', 'dummy', 'dummy2'], merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + recons = pd.read_excel( + reader, 'test1', index_col=0).astype(np.int64) frame.index.names = ['test'] assert frame.index.names == recons.index.names @@ -1415,9 +1348,8 @@ def test_roundtrip_indexlabels(self, merge_cells, engine, ext): index_label='test', merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + recons = pd.read_excel( + reader, 'test1', index_col=0).astype(np.int64) 
frame.index.names = ['test'] tm.assert_frame_equal(frame, recons.astype(bool)) @@ -1430,7 +1362,7 @@ def test_roundtrip_indexlabels(self, merge_cells, engine, ext): df = df.set_index(['A', 'B']) reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=[0, 1]) + recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) tm.assert_frame_equal(df, recons, check_less_precise=True) def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): @@ -1440,8 +1372,7 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): df.to_excel(self.path, merge_cells=merge_cells) xf = ExcelFile(self.path) - result = read_excel(xf, xf.sheet_names[0], - index_col=0) + result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) assert result.index.name == 'foo' @@ -1454,7 +1385,7 @@ def test_excel_roundtrip_datetime(self, merge_cells, *_): tsf.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(self.tsframe, recons) @@ -1488,8 +1419,8 @@ def test_excel_date_datetime_format(self, merge_cells, engine, ext): reader1 = ExcelFile(self.path) reader2 = ExcelFile(filename2) - rs1 = read_excel(reader1, "test1", index_col=0) - rs2 = read_excel(reader2, "test1", index_col=0) + rs1 = pd.read_excel(reader1, "test1", index_col=0) + rs2 = pd.read_excel(reader2, "test1", index_col=0) tm.assert_frame_equal(rs1, rs2) @@ -1511,7 +1442,7 @@ def test_to_excel_interval_no_labels(self, *_): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_interval_labels(self, *_): @@ -1529,7 +1460,7 @@ def test_to_excel_interval_labels(self, *_): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_timedelta(self, *_): @@ -1547,7 +1478,7 @@ def test_to_excel_timedelta(self, *_): frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, merge_cells, engine, ext): @@ -1557,7 +1488,7 @@ def test_to_excel_periodindex(self, merge_cells, engine, ext): xp.to_excel(self.path, 'sht1') reader = ExcelFile(self.path) - rs = read_excel(reader, 'sht1', index_col=0) + rs = pd.read_excel(reader, 'sht1', index_col=0) tm.assert_frame_equal(xp, rs.to_period('M')) def test_to_excel_multiindex(self, merge_cells, engine, ext): @@ -1573,7 +1504,7 @@ def test_to_excel_multiindex(self, merge_cells, engine, ext): # round trip frame.to_excel(self.path, 'test1', merge_cells=merge_cells) reader = ExcelFile(self.path) - df = read_excel(reader, 'test1', index_col=[0, 1]) + df = pd.read_excel(reader, 'test1', index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 @@ -1584,7 +1515,7 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): frame = frame.set_index(['A', 'B']) frame.to_excel(self.path, merge_cells=merge_cells) - df = read_excel(self.path, index_col=[0, 1]) + df = pd.read_excel(self.path, index_col=[0, 1]) tm.assert_frame_equal(frame, df) # Test for Issue 11328. 
If column indices are integers, make @@ -1607,8 +1538,7 @@ def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): # round trip frame.to_excel(self.path, 'test1', merge_cells=merge_cells) reader = ExcelFile(self.path) - df = read_excel(reader, 'test1', header=header, - index_col=[0, 1]) + df = pd.read_excel(reader, 'test1', header=header, index_col=[0, 1]) if not merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) @@ -1624,8 +1554,7 @@ def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): tsframe.index.names = ['time', 'foo'] tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=[0, 1]) + recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) assert recons.index.names == ('time', 'foo') @@ -1647,7 +1576,7 @@ def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, # Read it back in. reader = ExcelFile(self.path) - frame3 = read_excel(reader, 'test1') + frame3 = pd.read_excel(reader, 'test1') # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) @@ -1659,7 +1588,7 @@ def test_to_excel_float_format(self, *_): df.to_excel(self.path, "test1", float_format="%.2f") reader = ExcelFile(self.path) - result = read_excel(reader, "test1", index_col=0) + result = pd.read_excel(reader, "test1", index_col=0) expected = DataFrame([[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], @@ -1675,8 +1604,8 @@ def test_to_excel_output_encoding(self, merge_cells, engine, ext): with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") - result = read_excel(filename, "TestSheet", - encoding="utf8", index_col=0) + result = pd.read_excel(filename, "TestSheet", + encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, merge_cells, engine, ext): @@ -1694,7 +1623,7 @@ def test_to_excel_unicode_filename(self, merge_cells, engine, ext): df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) - result = read_excel(reader, "test1", index_col=0) + result = pd.read_excel(reader, "test1", index_col=0) expected = DataFrame([[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], @@ -1812,7 +1741,7 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): merge_cells=merge_cells, index=index) xf = ExcelFile(self.path) - return read_excel(xf, xf.sheet_names[0], header=parser_hdr) + return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) # Basic test. parser_header = 0 if use_headers else None @@ -1860,12 +1789,12 @@ def test_duplicated_columns(self, *_): columns=["A", "B", "B.1"]) # By default, we mangle. - result = read_excel(self.path, "test1", index_col=0) + result = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. 
- result = read_excel(self.path, "test1", index_col=0, - mangle_dupe_cols=True) + result = pd.read_excel(self.path, "test1", index_col=0, + mangle_dupe_cols=True) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 @@ -1873,21 +1802,22 @@ def test_duplicated_columns(self, *_): columns=["A", "B", "A", "B"]) df.to_excel(self.path, "test1") - result = read_excel(self.path, "test1", index_col=0) + result = pd.read_excel(self.path, "test1", index_col=0) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"]) tm.assert_frame_equal(result, expected) # see gh-10982 df.to_excel(self.path, "test1", index=False, header=False) - result = read_excel(self.path, "test1", header=None) + result = pd.read_excel(self.path, "test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): - read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) + pd.read_excel( + self.path, "test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, merge_cells, engine, ext): # Test for issue #5427. @@ -1895,7 +1825,7 @@ def test_swapped_columns(self, merge_cells, engine, ext): 'B': [2, 2, 2]}) write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) - read_frame = read_excel(self.path, 'test1', header=0) + read_frame = pd.read_excel(self.path, 'test1', header=0) tm.assert_series_equal(write_frame['A'], read_frame['A']) tm.assert_series_equal(write_frame['B'], read_frame['B']) @@ -1910,7 +1840,7 @@ def test_invalid_columns(self, *_): write_frame.to_excel(self.path, "test1", columns=["B", "C"]) expected = write_frame.reindex(columns=["B", "C"]) - read_frame = read_excel(self.path, "test1", index_col=0) + read_frame = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(expected, read_frame) with pytest.raises(KeyError): @@ -1919,7 +1849,7 @@ def test_invalid_columns(self, *_): def test_comment_arg(self, *_): # see gh-18735 # - # Test the comment argument functionality to read_excel. + # Test the comment argument functionality to pd.read_excel. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], @@ -1927,18 +1857,18 @@ def test_comment_arg(self, *_): df.to_excel(self.path, "test_c") # Read file without comment arg. 
- result1 = read_excel(self.path, "test_c", index_col=0) + result1 = pd.read_excel(self.path, "test_c", index_col=0) result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None - result2 = read_excel(self.path, "test_c", comment="#", index_col=0) + result2 = pd.read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) def test_comment_default(self, merge_cells, engine, ext): # Re issue #18735 - # Test the comment argument default to read_excel + # Test the comment argument default to pd.read_excel # Create file to read in df = DataFrame({'A': ['one', '#one', 'one'], @@ -1946,8 +1876,8 @@ def test_comment_default(self, merge_cells, engine, ext): df.to_excel(self.path, 'test_c') # Read file with default and explicit comment=None - result1 = read_excel(self.path, 'test_c') - result2 = read_excel(self.path, 'test_c', comment=None) + result1 = pd.read_excel(self.path, 'test_c') + result2 = pd.read_excel(self.path, 'test_c', comment=None) tm.assert_frame_equal(result1, result2) def test_comment_used(self, *_): @@ -1963,19 +1893,19 @@ def test_comment_used(self, *_): # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) - result = read_excel(self.path, "test_c", comment="#", index_col=0) + result = pd.read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) def test_comment_empty_line(self, merge_cells, engine, ext): # Re issue #18735 - # Test that read_excel ignores commented lines at the end of file + # Test that pd.read_excel ignores commented lines at the end of file df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) df.to_excel(self.path, index=False) # Test that all-comment lines at EoF are ignored expected = DataFrame({'a': [1], 'b': [2]}) - result = read_excel(self.path, comment='#') + result = pd.read_excel(self.path, comment='#') tm.assert_frame_equal(result, expected) def test_datetimes(self, merge_cells, engine, ext): @@ -1995,7 +1925,7 @@ def test_datetimes(self, merge_cells, engine, ext): write_frame = DataFrame({'A': datetimes}) write_frame.to_excel(self.path, 'Sheet1') - read_frame = read_excel(self.path, 'Sheet1', header=0) + read_frame = pd.read_excel(self.path, 'Sheet1', header=0) tm.assert_series_equal(write_frame['A'], read_frame['A']) @@ -2010,7 +1940,7 @@ def test_bytes_io(self, merge_cells, engine, ext): writer.save() bio.seek(0) - reread_df = read_excel(bio, index_col=0) + reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, *_): @@ -2019,7 +1949,7 @@ def test_write_lists_dict(self, *_): "numeric": [1, 2, 3.0], "str": ["apple", "banana", "cherry"]}) df.to_excel(self.path, "Sheet1") - read = read_excel(self.path, "Sheet1", header=0, index_col=0) + read = pd.read_excel(self.path, "Sheet1", header=0, index_col=0) expected = df.copy() expected.mixed = expected.mixed.apply(str) @@ -2033,8 +1963,8 @@ def test_true_and_false_value_options(self, *_): expected = df.replace({"foo": True, "bar": False}) df.to_excel(self.path) - read_frame = read_excel(self.path, true_values=["foo"], - false_values=["bar"], index_col=0) + read_frame = pd.read_excel(self.path, true_values=["foo"], + false_values=["bar"], index_col=0) tm.assert_frame_equal(read_frame, expected) def test_freeze_panes(self, *_): @@ -2042,7 +1972,7 @@ def test_freeze_panes(self, *_): expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) expected.to_excel(self.path, 
"Sheet1", freeze_panes=(1, 1)) - result = read_excel(self.path, index_col=0) + result = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, merge_cells, engine, ext): From 59df3e07d96d2463b62bd3fc38e11297590ed40d Mon Sep 17 00:00:00 2001 From: Tim Gates <47873678+timgates42@users.noreply.github.com> Date: Thu, 30 May 2019 11:27:47 +1000 Subject: [PATCH 04/51] Issue/26506 Provides correct desciption in docstring that get_indexer methods are not yet supported (#26519) --- pandas/core/indexes/base.py | 3 ++- pandas/core/indexes/interval.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a4544e79e2dfa..8538687ca3e91 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -52,6 +52,7 @@ _index_doc_kwargs = dict(klass='Index', inplace='', target_klass='Index', + raises_section='', unique='Index', duplicated='np.ndarray') _index_shared_docs = dict() @@ -2787,7 +2788,7 @@ def get_loc(self, key, method=None, tolerance=None): Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. - + %(raises_section)s Examples -------- >>> index = pd.Index(['c', 'a', 'b']) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 956a6f73dd785..53e1a36c48994 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -8,7 +8,7 @@ from pandas._libs import Timedelta, Timestamp from pandas._libs.interval import Interval, IntervalMixin, IntervalTree -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( @@ -822,7 +822,15 @@ def get_value(self, series, key): loc = self.get_loc(key) return series.iloc[loc] - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Substitution(**dict(_index_doc_kwargs, + **{'raises_section': textwrap.dedent(""" + Raises + ------ + NotImplementedError + If any method argument other than the default of + None is specified as these are not yet implemented. + """)})) + @Appender(_index_shared_docs['get_indexer']) def get_indexer(self, target, method=None, limit=None, tolerance=None): self._check_method(method) From 072408ea8d654e5bbab270f11fbe61246e76691f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 May 2019 20:29:40 -0500 Subject: [PATCH 05/51] ENH: Support nested renaming / selection (#26399) --- doc/source/user_guide/groupby.rst | 74 ++++++++-- doc/source/whatsnew/v0.25.0.rst | 41 ++++++ pandas/__init__.py | 2 +- pandas/core/api.py | 2 +- pandas/core/base.py | 14 +- pandas/core/groupby/__init__.py | 4 +- pandas/core/groupby/generic.py | 128 ++++++++++++++++-- pandas/tests/api/test_api.py | 1 + .../tests/groupby/aggregate/test_aggregate.py | 101 +++++++++++++- pandas/tests/groupby/aggregate/test_other.py | 1 + 10 files changed, 337 insertions(+), 31 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 4f116a42253e5..2014dbd9865f3 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -568,6 +568,67 @@ For a grouped ``DataFrame``, you can rename in a similar manner: 'mean': 'bar', 'std': 'baz'})) +.. _groupby.aggregate.named: + +Named Aggregation +~~~~~~~~~~~~~~~~~ + +.. 
versionadded:: 0.25.0 + +To support column-specific aggregation *with control over the output column names*, pandas +accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", where + +- The keywords are the *output* column names +- The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. Pandas + provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` + to make it clearer what the arguments are. As usual, the aggregation can + be a callable or a string alias. + +.. ipython:: python + + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + + animals.groupby("kind").agg( + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + ) + + +``pandas.NamedAgg`` is just a ``namedtuple``. Plain tuples are allowed as well. + +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('height', np.mean), + ) + + +If your desired output column names are not valid python keywords, construct a dictionary +and unpack the keyword arguments + +.. ipython:: python + + animals.groupby("kind").agg(**{ + 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), + }) + +Additional keyword arguments are not passed through to the aggregation functions. Only pairs +of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions +requires additional arguments, partially apply them with :meth:`functools.partial`. + +.. note:: + + For Python 3.5 and earlier, the order of ``**kwargs`` in a functions was not + preserved. This means that the output column ordering would not be + consistent. To ensure consistent ordering, the keys (and so output columns) + will always be sorted for Python 3.5. Applying different functions to DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -588,19 +649,6 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'C': 'sum', 'D': 'std'}) -.. note:: - - If you pass a dict to ``aggregate``, the ordering of the output columns is - non-deterministic. If you want to be sure the output columns will be in a specific - order, you can use an ``OrderedDict``. Compare the output of the following two commands: - -.. ipython:: python - - from collections import OrderedDict - - grouped.agg({'D': 'std', 'C': 'mean'}) - grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) - .. _groupby.aggregate.cython: Cython-optimized aggregation functions diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2c66d3e4db321..96837916f815b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -19,6 +19,47 @@ These are the changes in pandas 0.25.0. See :ref:`release` for a full changelog including other versions of pandas. +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_0250.enhancements.agg_relabel: + +Groupby Aggregation with Relabeling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has added special groupby behavior, known as "named aggregation", for naming the +output columns when applying multiple aggregation functions to specific columns (:issue:`18366`). + +.. 
ipython:: python + + animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], + 'height': [9.1, 6.0, 9.5, 34.0], + 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals + animals.groupby("kind").agg( + min_height=pd.NamedAgg(column='height', aggfunc='min'), + max_height=pd.NamedAgg(column='height', aggfunc='max'), + average_weight=pd.NamedAgg(column='height', aggfunc=np.mean), + ) + +Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` +should be tuples where the first element is the column selection, and the second element is the +aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer +what the arguments to the function are, but plain tuples are accepted as well. + +.. ipython:: python + + animals.groupby("kind").agg( + min_height=('height', 'min'), + max_height=('height', 'max'), + average_weight=('height', np.mean), + ) + +Named aggregation is the recommended replacement for the deprecated "dict-of-dicts" +approach to naming the output of column-specific aggregations (:ref:`whatsnew_0200.api_breaking.deprecate_group_agg_dict`). + +See :ref:`_groupby.aggregate.named` for more. + .. _whatsnew_0250.enhancements.other: Other Enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index 6af6f3093c120..4c494b4a62e39 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -65,7 +65,7 @@ to_numeric, to_datetime, to_timedelta, # misc - np, Grouper, factorize, unique, value_counts, + np, Grouper, factorize, unique, value_counts, NamedAgg, array, Categorical, set_eng_float_format, Series, DataFrame, Panel) diff --git a/pandas/core/api.py b/pandas/core/api.py index b7398e433f28f..0106feabcce74 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -21,7 +21,7 @@ DatetimeTZDtype, ) from pandas.core.arrays import Categorical, array -from pandas.core.groupby import Grouper +from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, diff --git a/pandas/core/base.py b/pandas/core/base.py index 3f59871fb5b38..e4274e48d3227 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -340,11 +340,15 @@ def _aggregate(self, arg, *args, **kwargs): def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 - warnings.warn( - ("using a dict with renaming " - "is deprecated and will be removed in a future " - "version"), - FutureWarning, stacklevel=level) + msg = textwrap.dedent("""\ + using a dict with renaming is deprecated and will be removed + in a future version. + + For column-specific groupby renaming, use named aggregation + + >>> df.groupby(...).agg(name=('column', aggfunc)) + """) + warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. 
{'A' : ['mean']}, normalize all to diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index ac35f3825e5e8..fe50bd91a4f56 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,4 @@ -from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.generic import ( # noqa: F401 - SeriesGroupBy, DataFrameGroupBy) + DataFrameGroupBy, NamedAgg, SeriesGroupBy) +from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2f665975f96bd..faa4d868bb65a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -6,15 +6,18 @@ which here returns a DataFrameGroupBy object. """ -from collections import OrderedDict, abc +from collections import OrderedDict, abc, namedtuple import copy from functools import partial from textwrap import dedent +import typing +from typing import Any, Callable, List, Union import warnings import numpy as np from pandas._libs import Timestamp, lib +from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution @@ -41,6 +44,10 @@ from pandas.plotting._core import boxplot_frame_groupby +NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +# TODO(typing) the return value on this callable should be any *scalar*. +AggScalar = Union[str, Callable[..., Any]] + class NDFrameGroupBy(GroupBy): @@ -144,8 +151,18 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, return new_items, new_blocks def aggregate(self, func, *args, **kwargs): - _level = kwargs.pop('_level', None) + + relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) + if relabeling: + func, columns, order = _normalize_keyword_aggregation(kwargs) + + kwargs = {} + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of " + "'(column, aggfunc).") + result, how = self._aggregate(func, _level=_level, *args, **kwargs) if how is None: return result @@ -179,6 +196,10 @@ def aggregate(self, func, *args, **kwargs): self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) + if relabeling: + result = result[order] + result.columns = columns + return result._convert(datetime=True) agg = aggregate @@ -791,11 +812,8 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - if isinstance(f, str): - columns.append(f) - else: - # protect against callables without names - columns.append(com.get_callable_name(f)) + columns.append(com.get_callable_name(f) or f) + arg = zip(columns, arg) results = OrderedDict() @@ -1296,6 +1314,26 @@ class DataFrameGroupBy(NDFrameGroupBy): A 1 1 2 0.590716 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + b_min c_sum + A + 1 1 -1.956929 + 2 3 -0.322183 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. + Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. 
+ As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. """) @Substitution(see_also=_agg_see_also_doc, @@ -1304,7 +1342,7 @@ class DataFrameGroupBy(NDFrameGroupBy): klass='DataFrame', axis='') @Appender(_shared_docs['aggregate']) - def aggregate(self, arg, *args, **kwargs): + def aggregate(self, arg=None, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate @@ -1577,3 +1615,77 @@ def groupby_series(obj, col=None): return results boxplot = boxplot_frame_groupby + + +def _is_multi_agg_with_relabel(**kwargs): + """ + Check whether the kwargs pass to .agg look like multi-agg with relabling. + + Parameters + ---------- + **kwargs : dict + + Returns + ------- + bool + + Examples + -------- + >>> _is_multi_agg_with_relabel(a='max') + False + >>> _is_multi_agg_with_relabel(a_max=('a', 'max'), + ... a_min=('a', 'min')) + True + >>> _is_multi_agg_with_relabel() + False + """ + return all( + isinstance(v, tuple) and len(v) == 2 + for v in kwargs.values() + ) and kwargs + + +def _normalize_keyword_aggregation(kwargs): + """ + Normalize user-provided "named aggregation" kwargs. + + Transforms from the new ``Dict[str, NamedAgg]`` style kwargs + to the old OrderedDict[str, List[scalar]]]. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + aggspec : dict + The transformed kwargs. + columns : List[str] + The user-provided keys. + order : List[Tuple[str, str]] + Pairs of the input and output column names. + + Examples + -------- + >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) + (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + """ + if not PY36: + kwargs = OrderedDict(sorted(kwargs.items())) + + # Normalize the aggregation functions as Dict[column, List[func]], + # process normally, then fixup the names. 
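+    # Illustrative walk-through (an editorial sketch, not part of the
+    # original patch): given kwargs == {'b_min': ('B', 'min'),
+    # 'b_max': ('B', 'max')}, the loop below builds
+    #     aggspec == OrderedDict([('B', ['min', 'max'])])
+    #     columns == ('b_min', 'b_max')
+    #     order   == [('B', 'min'), ('B', 'max')]
+    # so downstream code can aggregate per input column, then restore the
+    # user's requested output names and ordering.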
+ # TODO(Py35): When we drop python 3.5, change this to + # defaultdict(list) + aggspec = OrderedDict() # type: typing.OrderedDict[str, List[AggScalar]] + order = [] + columns, pairs = list(zip(*kwargs.items())) + + for name, (column, aggfunc) in zip(columns, pairs): + if column in aggspec: + aggspec[column].append(aggfunc) + else: + aggspec[column] = [aggfunc] + order.append((column, + com.get_callable_name(aggfunc) or aggfunc)) + return aggspec, columns, order diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c92808200ebea..aa42484bf9513 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,6 +47,7 @@ class TestPDApi(Base): 'DatetimeTZDtype', 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', + 'NamedAgg', ] # these are already deprecated; awaiting removal diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 6f54d05680698..9e714a1086037 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -2,12 +2,13 @@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ from collections import OrderedDict +import functools import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -326,3 +327,101 @@ def test_uint64_type_handling(dtype, how): result = df.groupby('y').agg({'x': how}) result.x = result.x.astype(np.int64) tm.assert_frame_equal(result, expected, check_exact=True) + + +class TestNamedAggregation: + + def test_agg_relabel(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + result = df.groupby("group").agg( + a_max=("A", "max"), + b_max=("B", "max"), + ) + expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['a_max', 'b_max']) + tm.assert_frame_equal(result, expected) + + # order invariance + p98 = functools.partial(np.percentile, q=98) + result = df.groupby('group').agg( + b_min=("B", "min"), + a_min=("A", min), + a_mean=("A", np.mean), + a_max=("A", "max"), + b_max=("B", "max"), + a_98=("A", p98) + ) + expected = pd.DataFrame({"b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98]}, + index=pd.Index(['a', 'b'], name='group'), + columns=['b_min', 'a_min', 'a_mean', + 'a_max', 'b_max', 'a_98']) + if not compat.PY36: + expected = expected[['a_98', 'a_max', 'a_mean', + 'a_min', 'b_max', 'b_min']] + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_non_identifier(self): + df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], + "A": [0, 1, 2, 3], + "B": [5, 6, 7, 8]}) + + result = df.groupby("group").agg(**{'my col': ('A', 'max')}) + expected = pd.DataFrame({'my col': [1, 3]}, + index=pd.Index(['a', 'b'], name='group')) + tm.assert_frame_equal(result, expected) + + def test_duplicate_raises(self): + # TODO: we currently raise on multiple lambdas. We could *maybe* + # update com.get_callable_name to append `_i` to each lambda. 
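+        # A hypothetical illustration (not from the original patch): a call
+        # such as
+        #     df.groupby("A").agg(a=("B", lambda x: x.min()),
+        #                         b=("B", lambda x: x.max()))
+        # also raises, because com.get_callable_name returns '<lambda>' for
+        # both functions and the generated names collide.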
+ df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match="Function names"): + df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + def test_agg_relabel_with_level(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([['A', 'B'], + ['a', 'b']])) + result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), + cc=('B', 'mean')) + expected = pd.DataFrame({ + 'aa': [0, 1], + 'bb': [0, 1], + 'cc': [1.5, 3.5] + }, index=['A', 'B']) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_other_raises(self): + df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) + grouped = df.groupby("A") + match = 'Must provide' + with pytest.raises(TypeError, match=match): + grouped.agg(foo=1) + + with pytest.raises(TypeError, match=match): + grouped.agg() + + with pytest.raises(TypeError, match=match): + grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + + def test_missing_raises(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + with pytest.raises(KeyError, match="Column 'C' does not exist"): + df.groupby("A").agg(c=('C', 'sum')) + + def test_agg_namedtuple(self): + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.groupby("A").agg( + b=pd.NamedAgg("B", "sum"), + c=pd.NamedAgg(column="B", aggfunc="count") + ) + expected = df.groupby("A").agg(b=("B", "sum"), + c=("B", "count")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 02d8c09bf2c8f..8168cf06ffdb1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -217,6 +217,7 @@ def test_agg_dict_renaming_deprecation(): df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}}) assert "using a dict with renaming" in str(w[0].message) + assert "named aggregation" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) From 9e76f4a8d3374cd4f21b2a531b19f58a686136ab Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Thu, 30 May 2019 02:33:49 +0100 Subject: [PATCH 06/51] Fix 'observed' kwarg not doing anything on SeriesGroupBy (#26463) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/generic.py | 93 ++------ pandas/core/groupby/groupby.py | 76 ++++++- pandas/tests/groupby/test_categorical.py | 264 +++++++++++++++-------- 4 files changed, 268 insertions(+), 166 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 96837916f815b..89a9da4a73b35 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -553,6 +553,7 @@ Groupby/Resample/Rolling - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) +- Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of 
Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index faa4d868bb65a..121244cde368a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -28,7 +28,6 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.frame import DataFrame @@ -36,7 +35,7 @@ from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, _apply_docs, _transform_template) -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -852,9 +851,10 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output(output=output, + index=self.grouper.result_index, + names=names) + return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): return self._wrap_output(output=output, @@ -874,13 +874,16 @@ def _get_index(): return index if isinstance(values[0], dict): - # GH #823 + # GH #823 #24880 index = _get_index() - result = DataFrame(values, index=index).stack() + result = self._reindex_output(DataFrame(values, index=index)) + # if self.observed is False, + # keep all-NaN rows created while re-indexing + result = result.stack(dropna=self.observed) result.name = self._selection_name return result - if isinstance(values[0], (Series, dict)): + if isinstance(values[0], Series): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): @@ -888,9 +891,11 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - # GH #6265 - return Series(values, index=_get_index(), - name=self._selection_name) + # GH #6265 #24880 + result = Series(data=values, + index=_get_index(), + name=self._selection_name) + return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): result = OrderedDict() @@ -1373,7 +1378,8 @@ def _gotitem(self, key, ndim, subset=None): if subset is None: subset = self.obj[key] return SeriesGroupBy(subset, selection=key, - grouper=self.grouper) + grouper=self.grouper, + observed=self.observed) raise AssertionError("invalid ndim for _gotitem") @@ -1445,69 +1451,6 @@ def _wrap_agged_blocks(self, items, blocks): return self._reindex_output(result)._convert(datetime=True) - def _reindex_output(self, result): - """ - If we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not - participated in the groupings (e.g. 
may have all been - nan groups); - - This can re-expand the output space - """ - - # we need to re-expand the output space to accomodate all values - # whether observed or not in the cartesian product of our groupes - groupings = self.grouper.groupings - if groupings is None: - return result - elif len(groupings) == 1: - return result - - # if we only care about the observed values - # we are done - elif self.observed: - return result - - # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): - return result - - levels_list = [ping.group_index for ping in groupings] - index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() - - if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d) - - # GH 13204 - # Here, the categorical in-axis groupers, which need to be fully - # expanded, are columns in `result`. An idea is to do: - # result = result.set_index(self.grouper.names) - # .reindex(index).reset_index() - # but special care has to be taken because of possible not-in-axis - # groupers. - # So, we manually select and drop the in-axis grouper columns, - # reindex `result`, and then reset the in-axis grouper columns. - - # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) - g_nums, g_names = zip(*in_axis_grps) - - result = result.drop(labels=list(g_names), axis=1) - - # Set a temp index and reindex (possibly expanding) - result = result.set_index(self.grouper.result_index - ).reindex(index, copy=False) - - # Reset in-axis grouper columns - # (using level numbers `g_nums` because level names may not be unique) - result = result.reset_index(level=g_nums) - - return result.reset_index(drop=True) - def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa04b7505afe4..91bb71a1a8af7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -36,13 +36,14 @@ class providing the base-class of operations. from pandas.api.types import ( is_datetime64_dtype, is_integer_dtype, is_object_dtype) import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex +from pandas.core.index import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -2301,6 +2302,79 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + def _reindex_output(self, output): + """ + If we have categorical groupers, then we might want to make sure that + we have a fully re-indexed output to the levels. This means expanding + the output space to accommodate all values in the cartesian product of + our groups, regardless of whether they were observed in the data or + not. This will expand the output space if there are missing groups. 
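+
+        For example (an illustrative sketch, not part of the original
+        patch): grouping on two categorical columns with ``observed=False``
+        and aggregating with ``.mean()`` yields one output row per category
+        combination, with ``NaN`` filled in for combinations never observed
+        in the data.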
+ + The method returns early without modifying the input if the number of + groupings is less than 2, self.observed == True or none of the groupers + are categorical. + + Parameters + ---------- + output: Series or DataFrame + Object resulting from grouping and applying an operation. + + Returns + ------- + Series or DataFrame + Object (potentially) re-indexed to include all possible groups. + """ + groupings = self.grouper.groupings + if groupings is None: + return output + elif len(groupings) == 1: + return output + + # if we only care about the observed values + # we are done + elif self.observed: + return output + + # reindexing only applies to a Categorical grouper + elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings): + return output + + levels_list = [ping.group_index for ping in groupings] + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return output.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `output`. An idea is to do: + # output = output.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `output`, and then reset the in-axis grouper columns. + + # Select in-axis groupers + in_axis_grps = ((i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis) + g_nums, g_names = zip(*in_axis_grps) + + output = output.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + output = output.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + output = output.reset_index(level=g_nums) + + return output.reset_index(drop=True) + GroupBy._add_numeric_operations() diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 112f7629d735a..f24fa0daa5b18 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,3 +1,4 @@ +from collections import OrderedDict from datetime import datetime import numpy as np @@ -25,7 +26,7 @@ def f(a): ordered=a.ordered) return a - index = pd.MultiIndex.from_product(map(f, args), names=names) + index = MultiIndex.from_product(map(f, args), names=names) return result.reindex(index).sort_index() @@ -189,7 +190,7 @@ def test_level_get_group(observed): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + levels=[CategoricalIndex(["a", "b"]), range(10)], codes=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) g = df.groupby(level=["Index1"], observed=observed) @@ -197,7 +198,7 @@ def test_level_get_group(observed): # expected should equal test.loc[["a"]] # GH15166 expected = DataFrame(data=np.arange(2, 12, 2), - index=pd.MultiIndex(levels=[pd.CategoricalIndex( + index=MultiIndex(levels=[CategoricalIndex( ["a", "b"]), range(5)], codes=[[0] * 5, range(5)], names=["Index1", "Index2"])) @@ -265,7 +266,7 @@ def test_observed(observed): # multiple groupers with a non-cat gb = df.groupby(['A', 'B', 'C'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( + exp_index = MultiIndex.from_arrays( [cat1, cat2, 
['foo', 'bar'] * 2], names=['A', 'B', 'C']) expected = DataFrame({'values': Series( @@ -280,7 +281,7 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) gb = df.groupby(['A', 'B'], observed=observed) - exp_index = pd.MultiIndex.from_arrays( + exp_index = MultiIndex.from_arrays( [cat1, cat2], names=['A', 'B']) expected = DataFrame({'values': [1, 2, 3, 4]}, @@ -296,25 +297,25 @@ def test_observed(observed): # https://github.com/pandas-dev/pandas/issues/8138 d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), + Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), 'ints': [1, 1, 2, 2], 'val': [10, 20, 30, 40]} - df = pd.DataFrame(d) + df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() - exp_index = pd.CategoricalIndex(list('ab'), name="cat", - categories=list('abc'), - ordered=True) + exp_index = CategoricalIndex(list('ab'), name="cat", + categories=list('abc'), + ordered=True) expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, index=exp_index) if not observed: - index = pd.CategoricalIndex(list('abc'), name="cat", - categories=list('abc'), - ordered=True) + index = CategoricalIndex(list('abc'), name="cat", + categories=list('abc'), + ordered=True) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) @@ -324,9 +325,9 @@ def test_observed(observed): result = groups_double_key.agg('mean') expected = DataFrame( {"val": [10, 30, 20, 40], - "cat": pd.Categorical(['a', 'a', 'b', 'b'], - categories=['a', 'b', 'c'], - ordered=True), + "cat": Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( @@ -347,7 +348,7 @@ def test_observed(observed): # with as_index d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} - df = pd.DataFrame(d) + df = DataFrame(d) cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) df['range'] = cat groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) @@ -360,7 +361,7 @@ def test_observed(observed): def test_observed_codes_remap(observed): d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - df = pd.DataFrame(d) + df = DataFrame(d) values = pd.cut(df['C1'], [1, 2, 3, 6]) values.name = "cat" groups_double_key = df.groupby([values, 'C2'], observed=observed) @@ -401,8 +402,8 @@ def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups - cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) - df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + cat = Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = DataFrame({'cat': cat, 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups @@ -419,9 +420,9 @@ def test_observed_groups(observed): def test_observed_groups_with_nan(observed): # GH 24740 - df = pd.DataFrame({'cat': pd.Categorical(['a', np.nan, 'a'], - categories=['a', 'b', 'd']), - 'vals': [1, 2, 3]}) + df = DataFrame({'cat': Categorical(['a', np.nan, 'a'], + categories=['a', 'b', 'd']), + 'vals': [1, 2, 3]}) g = df.groupby('cat', observed=observed) result = g.groups if observed: @@ -435,16 +436,16 @@ def test_observed_groups_with_nan(observed): def test_dataframe_categorical_with_nan(observed): # GH 21151 - s1 = pd.Categorical([np.nan, 'a', np.nan, 'a'], - 
categories=['a', 'b', 'c']) - s2 = pd.Series([1, 2, 3, 4]) - df = pd.DataFrame({'s1': s1, 's2': s2}) + s1 = Categorical([np.nan, 'a', np.nan, 'a'], + categories=['a', 'b', 'c']) + s2 = Series([1, 2, 3, 4]) + df = DataFrame({'s1': s1, 's2': s2}) result = df.groupby('s1', observed=observed).first().reset_index() if observed: - expected = DataFrame({'s1': pd.Categorical(['a'], + expected = DataFrame({'s1': Categorical(['a'], categories=['a', 'b', 'c']), 's2': [2]}) else: - expected = DataFrame({'s1': pd.Categorical(['a', 'b', 'c'], + expected = DataFrame({'s1': Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']), 's2': [2, np.nan, np.nan]}) tm.assert_frame_equal(result, expected) @@ -459,11 +460,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # Build a dataframe with cat having one unobserved category ('missing'), # and a Series with identical values - label = pd.Categorical(['d', 'a', 'b', 'a', 'd', 'b'], - categories=['a', 'b', 'missing', 'd'], - ordered=ordered) - val = pd.Series(['d', 'a', 'b', 'a', 'd', 'b']) - df = pd.DataFrame({'label': label, 'val': val}) + label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'], + categories=['a', 'b', 'missing', 'd'], + ordered=ordered) + val = Series(['d', 'a', 'b', 'a', 'd', 'b']) + df = DataFrame({'label': label, 'val': val}) # aggregate on the Categorical result = (df.groupby('label', observed=observed, sort=sort)['val'] @@ -471,8 +472,8 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # If ordering works, we expect index labels equal to aggregation results, # except for 'observed=False': label 'missing' has aggregation None - label = pd.Series(result.index.array, dtype='object') - aggr = pd.Series(result.array) + label = Series(result.index.array, dtype='object') + aggr = Series(result.array) if not observed: aggr[aggr.isna()] = 'missing' if not all(label == aggr): @@ -555,9 +556,9 @@ def test_categorical_index(): def test_describe_categorical_columns(): # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) + cats = CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() @@ -567,22 +568,22 @@ def test_describe_categorical_columns(): def test_unstack_categorical(): # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) + df = DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) df['medium'] = df['medium'].astype('category') gcat = df.groupby( ['artist', 'medium'], observed=False)['a'].count().unstack() result = gcat.describe() - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') + exp_columns = CategoricalIndex(['A', 'B'], ordered=False, + name='medium') tm.assert_index_equal(result.columns, exp_columns) tm.assert_categorical_equal(result.columns.values, exp_columns.values) result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + expected = Series([6, 4], index=Index(['X', 'Y'], name='artist')) tm.assert_series_equal(result, expected) @@ -644,22 +645,22 @@ def test_preserve_categories(): categories = list('abc') # ordered=True - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=True)}) - index = 
pd.CategoricalIndex(categories, categories, ordered=True) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = CategoricalIndex(categories, categories, ordered=True) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, index) tm.assert_index_equal( df.groupby('A', sort=False, observed=False).first().index, index) # ordered=False - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=False)}) - sort_index = pd.CategoricalIndex(categories, categories, ordered=False) - nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), - ordered=False) + df = DataFrame({'A': Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = CategoricalIndex(categories, categories, ordered=False) + nosort_index = CategoricalIndex(list('bac'), list('bac'), + ordered=False) tm.assert_index_equal( df.groupby('A', sort=True, observed=False).first().index, sort_index) @@ -857,94 +858,94 @@ def test_sort_datetimelike(): def test_empty_sum(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 0 by default result = df.groupby("A", observed=False).B.sum() - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.sum(min_count=0) - expected = pd.Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.sum(min_count=1) - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + expected = Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count>1 result = df.groupby("A", observed=False).B.sum(min_count=2) - expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + expected = Series([3, np.nan, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_empty_prod(): # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) + df = DataFrame({"A": Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') # 1 by default result = df.groupby("A", observed=False).B.prod() - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.prod(min_count=0) - expected = pd.Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.prod(min_count=1) - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + expected = Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) def test_groupby_multiindex_categorical_datetime(): # 
https://github.com/pandas-dev/pandas/issues/21390 - df = pd.DataFrame({ - 'key1': pd.Categorical(list('abcbabcba')), - 'key2': pd.Categorical( + df = DataFrame({ + 'key1': Categorical(list('abcbabcba')), + 'key2': Categorical( list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), 'values': np.arange(9), }) result = df.groupby(['key1', 'key2']).mean() - idx = pd.MultiIndex.from_product( - [pd.Categorical(['a', 'b', 'c']), - pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + idx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], names=['key1', 'key2']) - expected = pd.DataFrame( + expected = DataFrame( {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) @pytest.mark.parametrize("as_index, expected", [ - (True, pd.Series( - index=pd.MultiIndex.from_arrays( - [pd.Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b'] + (True, Series( + index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype='category'), + [1, 2, 2]], names=['a', 'b'] ), data=[1, 2, 3], name='x' )), - (False, pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + (False, DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] })) ]) def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - df = pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), + df = DataFrame({ + 'a': Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] }) @@ -957,9 +958,92 @@ def test_groupby_agg_observed_true_single_column(as_index, expected): @pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) def test_shift(fill_value): - ct = pd.Categorical(['a', 'b', 'c', 'd'], - categories=['a', 'b', 'c', 'd'], ordered=False) - expected = pd.Categorical([None, 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], ordered=False) + ct = Categorical(['a', 'b', 'c', 'd'], + categories=['a', 'b', 'c', 'd'], ordered=False) + expected = Categorical([None, 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], ordered=False) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) + + +@pytest.fixture +def df_cat(df): + """ + DataFrame with multiple categorical columns and a column of integers. + Shortened so as not to contain all possible combinations of categories. + Useful for testing `observed` kwarg functionality on GroupBy objects. 
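+
+    (Editorial note, inferred from the expected values in the tests below:
+    the four retained rows cover the (A, B) pairs (foo, one), (foo, two),
+    (bar, one) and (bar, three), so (foo, three) and (bar, two) exist as
+    categories but are never observed.)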
+ + Parameters + ---------- + df: DataFrame + Non-categorical, longer DataFrame from another fixture, used to derive + this one + + Returns + ------- + df_cat: DataFrame + """ + df_cat = df.copy()[:4] # leave out some groups + df_cat['A'] = df_cat['A'].astype('category') + df_cat['B'] = df_cat['B'].astype('category') + df_cat['C'] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(['D'], axis=1) + return df_cat + + +@pytest.mark.parametrize('operation, kwargs', [ + ('agg', dict(dtype='category')), + ('apply', dict())]) +def test_seriesgroupby_observed_true(df_cat, operation, kwargs): + # GH 24880 + index = MultiIndex.from_frame( + DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], + 'B': ['one', 'two', 'one', 'three'] + }, **kwargs)) + expected = Series(data=[1, 3, 2, 4], index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('operation', ['agg', 'apply']) +@pytest.mark.parametrize('observed', [False, None]) +def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): + # GH 24880 + index, _ = MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False)], + names=['A', 'B']).sortlevel() + + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], + index=index, name='C') + grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] + result = getattr(grouped, operation)(sum) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize("observed, index, data", [ + (True, MultiIndex.from_tuples( + [('foo', 'one', 'min'), ('foo', 'one', 'max'), + ('foo', 'two', 'min'), ('foo', 'two', 'max'), + ('bar', 'one', 'min'), ('bar', 'one', 'max'), + ('bar', 'three', 'min'), ('bar', 'three', 'max')], + names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), + (False, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), + (None, MultiIndex.from_product( + [CategoricalIndex(['bar', 'foo'], ordered=False), + CategoricalIndex(['one', 'three', 'two'], ordered=False), + Index(['min', 'max'])], + names=['A', 'B', None]), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) +def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): + # GH 24880 + expected = Series(data=data, index=index, name='C') + result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( + lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + assert_series_equal(result, expected) From 8154efb0c1a64295cf54e00025b4ab09bcd02752 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 29 May 2019 18:49:16 -0700 Subject: [PATCH 07/51] Remove Unnecessary Subclasses from test_excel (#26553) --- pandas/io/excel/_base.py | 2 +- pandas/tests/io/test_excel.py | 42 ++++++++++++++++------------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3af6be7a371e7..24412b26b021b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -591,7 +591,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): def __new__(cls, path, engine=None, **kwargs): # only switch class if generic(ExcelWriter) - if issubclass(cls, ExcelWriter): + if cls is ExcelWriter: if engine is None or (isinstance(engine, str) and engine == 'auto'): if 
isinstance(path, str): diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 6db3d1d4ab34d..1421fc94b67f4 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1995,11 +1995,10 @@ def test_path_local_path(self, merge_cells, engine, ext): @td.skip_if_no('openpyxl') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xlsx', 'openpyxl')]) -class TestOpenpyxlTests(_WriterBase): +@pytest.mark.parametrize("ext", ['.xlsx']) +class TestOpenpyxlTests: - def test_to_excel_styleconverter(self, merge_cells, ext, engine): + def test_to_excel_styleconverter(self, ext): from openpyxl import styles hstyle = { @@ -2053,7 +2052,7 @@ def test_to_excel_styleconverter(self, merge_cells, ext, engine): assert kw['number_format'] == number_format assert kw['protection'] == protection - def test_write_cells_merge_styled(self, merge_cells, ext, engine): + def test_write_cells_merge_styled(self, ext): from pandas.io.formats.excel import ExcelCell sheet_name = 'merge_styled' @@ -2087,7 +2086,7 @@ def test_write_cells_merge_styled(self, merge_cells, ext, engine): @pytest.mark.parametrize("mode,expected", [ ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) - def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): + def test_write_append_mode(self, ext, mode, expected): import openpyxl df = DataFrame([1], columns=['baz']) @@ -2099,7 +2098,7 @@ def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): wb.worksheets[1]['A1'].value = 'bar' wb.save(f) - writer = ExcelWriter(f, engine=engine, mode=mode) + writer = ExcelWriter(f, engine='openpyxl', mode=mode) df.to_excel(writer, sheet_name='baz', index=False) writer.save() @@ -2112,12 +2111,11 @@ def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): @td.skip_if_no('xlwt') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xls', 'xlwt')]) -class TestXlwtTests(_WriterBase): +@pytest.mark.parametrize("ext,", ['.xls']) +class TestXlwtTests: def test_excel_raise_error_on_multiindex_columns_and_no_index( - self, merge_cells, ext, engine): + self, ext): # MultiIndex as columns is not yet implemented 9794 cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), @@ -2127,8 +2125,7 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index( with ensure_clean(ext) as path: df.to_excel(path, index=False) - def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext, - engine): + def test_excel_multiindex_columns_and_index_true(self, ext): cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) @@ -2136,7 +2133,7 @@ def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext, with ensure_clean(ext) as path: df.to_excel(path, index=True) - def test_excel_multiindex_index(self, merge_cells, ext, engine): + def test_excel_multiindex_index(self, ext): # MultiIndex as index works so assert no error #9794 cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), @@ -2145,7 +2142,7 @@ def test_excel_multiindex_index(self, merge_cells, ext, engine): with ensure_clean(ext) as path: df.to_excel(path, index=False) - def test_to_excel_styleconverter(self, merge_cells, ext, engine): + def test_to_excel_styleconverter(self, ext): import xlwt hstyle = {"font": {"bold": True}, @@ -2164,21 +2161,20 @@ def test_to_excel_styleconverter(self, merge_cells, ext, engine): assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert - def 
test_write_append_mode_raises(self, merge_cells, ext, engine): + def test_write_append_mode_raises(self, ext): msg = "Append mode is not supported with xlwt!" with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine=engine, mode='a') + ExcelWriter(f, engine='xlwt', mode='a') @td.skip_if_no('xlsxwriter') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xlsx', 'xlsxwriter')]) -class TestXlsxWriterTests(_WriterBase): +@pytest.mark.parametrize("ext", ['.xlsx']) +class TestXlsxWriterTests: @td.skip_if_no('openpyxl') - def test_column_format(self, merge_cells, ext, engine): + def test_column_format(self, ext): # Test that column formats are applied to cells. Test for issue #9167. # Applicable to xlsxwriter only. with warnings.catch_warnings(): @@ -2222,12 +2218,12 @@ def test_column_format(self, merge_cells, ext, engine): assert read_num_format == num_format - def test_write_append_mode_raises(self, merge_cells, ext, engine): + def test_write_append_mode_raises(self, ext): msg = "Append mode is not supported with xlsxwriter!" with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine=engine, mode='a') + ExcelWriter(f, engine='xlsxwriter', mode='a') class TestExcelWriterEngineTests: From a60d1bd45a99519fad5024068db956e0aa1cc6a1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 30 May 2019 14:15:18 +0100 Subject: [PATCH 08/51] DEPR: remove Panel-specific parts of core.indexing (#25567) --- pandas/_libs/indexing.pyx | 4 + pandas/core/indexing.py | 88 ++----------------- pandas/tests/indexing/test_indexing.py | 113 +++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 82 deletions(-) diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 6e62978c8477f..308e914b7b5b7 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -17,4 +17,8 @@ cdef class _NDFrameIndexerBase: ndim = self._ndim if ndim is None: ndim = self._ndim = self.obj.ndim + if ndim > 2: + msg = ("NDFrameIndexer does not support NDFrame objects with" + " ndim > 2") + raise ValueError(msg) return ndim diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 93e56834b62f6..86158fa9ee529 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,7 +11,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse) -from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com @@ -450,10 +450,6 @@ def _setitem_with_indexer(self, indexer, value): self.obj._maybe_update_cacher(clear=True) return self.obj - # set using setitem (Panel and > dims) - elif self.ndim >= 3: - return self.obj.__setitem__(indexer, value) - # set item_labels = self.obj._get_axis(info_axis) @@ -642,9 +638,6 @@ def can_do_equal_len(): elif isinstance(value, ABCDataFrame): value = self._align_frame(indexer, value) - if isinstance(value, ABCPanel): - value = self._align_panel(indexer, value) - # check for chained assignment self.obj._check_is_chained_assignment_possible() @@ -690,7 +683,6 @@ def ravel(i): sum_aligners = sum(aligners) single_aligner = sum_aligners == 1 is_frame = self.obj.ndim == 2 - is_panel = self.obj.ndim >= 3 obj = self.obj # are we a single alignable value on a non-primary @@ -702,11 +694,6 @@ def 
ravel(i): if is_frame: single_aligner = single_aligner and aligners[0] - # panel - elif is_panel: - single_aligner = (single_aligner and - (aligners[1] or aligners[2])) - # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if (sum_aligners == self.ndim and @@ -738,7 +725,7 @@ def ravel(i): return ser.reindex(new_ix)._values # 2 dims - elif single_aligner and is_frame: + elif single_aligner: # reindex along index ax = self.obj.axes[1] @@ -746,30 +733,6 @@ def ravel(i): return ser._values.copy() return ser.reindex(ax)._values - # >2 dims - elif single_aligner: - - broadcast = [] - for n, labels in enumerate(self.obj._get_plane_axes(i)): - - # reindex along the matching dimensions - if len(labels & ser.index): - ser = ser.reindex(labels) - else: - broadcast.append((n, len(labels))) - - # broadcast along other dims - ser = ser._values.copy() - for (axis, l) in broadcast: - shape = [-1] * (len(broadcast) + 1) - shape[axis] = l - ser = np.tile(ser, l).reshape(shape) - - if self.obj.ndim == 3: - ser = ser.T - - return ser - elif is_scalar(indexer): ax = self.obj._get_axis(1) @@ -782,7 +745,6 @@ def ravel(i): def _align_frame(self, indexer, df): is_frame = self.obj.ndim == 2 - is_panel = self.obj.ndim >= 3 if isinstance(indexer, tuple): @@ -802,21 +764,6 @@ def _align_frame(self, indexer, df): else: sindexers.append(i) - # panel - if is_panel: - - # need to conform to the convention - # as we are not selecting on the items axis - # and we have a single indexer - # GH 7763 - if len(sindexers) == 1 and sindexers[0] != 0: - df = df.T - - if idx is None: - idx = df.index - if cols is None: - cols = df.columns - if idx is not None and cols is not None: if df.index.equals(idx) and df.columns.equals(cols): @@ -843,24 +790,8 @@ def _align_frame(self, indexer, df): val = df.reindex(index=ax)._values return val - elif is_scalar(indexer) and is_panel: - idx = self.obj.axes[1] - cols = self.obj.axes[2] - - # by definition we are indexing on the 0th axis - # a passed in dataframe which is actually a transpose - # of what is needed - if idx.equals(df.index) and cols.equals(df.columns): - return df.copy()._values - - return df.reindex(idx, columns=cols)._values - raise ValueError('Incompatible indexer with DataFrame') - def _align_panel(self, indexer, df): - raise NotImplementedError("cannot set using an indexer with a Panel " - "yet!") - def _getitem_tuple(self, tup): try: return self._getitem_lowerdim(tup) @@ -1059,13 +990,6 @@ def _getitem_nested_tuple(self, tup): # has the dim of the obj changed? # GH 7199 if obj.ndim < current_ndim: - - # GH 7516 - # if had a 3 dim and are going to a 2d - # axes are reversed on a DataFrame - if i >= 1 and current_ndim == 3 and obj.ndim == 2: - obj = obj.T - axis -= 1 return obj @@ -1562,8 +1486,8 @@ class _LocIndexer(_LocationIndexer): - A boolean array of the same length as the axis being sliced, e.g. ``[True, False, True]``. - - A ``callable`` function with one argument (the calling Series, DataFrame - or Panel) and that returns valid output for indexing (one of the above) + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) See more at :ref:`Selection by Label ` @@ -1931,8 +1855,8 @@ class _iLocIndexer(_LocationIndexer): - A list or array of integers, e.g. ``[4, 3, 0]``. - A slice object with ints, e.g. ``1:7``. - A boolean array. 
- - A ``callable`` function with one argument (the calling Series, DataFrame - or Panel) and that returns valid output for indexing (one of the above). + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). This is useful in method chains, when you don't have a reference to the calling object, but would like to base your selection on some value. diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 22f6855717e80..a0e3df182b129 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -11,11 +11,14 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series +from pandas.core.generic import NDFrame from pandas.core.indexing import ( _maybe_numeric_slice, _non_reducing_slice, validate_indices) from pandas.tests.indexing.common import Base, _mklbl import pandas.util.testing as tm +ignore_ix = pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") + # ------------------------------------------------------------------------ # Indexing test cases @@ -53,6 +56,93 @@ def test_setitem_ndarray_1d(self): with pytest.raises(ValueError): df[2:5] = np.arange(1, 4) * 1j + @pytest.mark.parametrize('index', tm.all_index_generator(5), + ids=lambda x: type(x).__name__) + @pytest.mark.parametrize('obj', [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame( + np.random.randn(len(i), len(i)), index=i, columns=i) + ], ids=['Series', 'DataFrame']) + @pytest.mark.parametrize('idxr, idxr_id', [ + (lambda x: x, 'getitem'), + (lambda x: x.loc, 'loc'), + (lambda x: x.iloc, 'iloc'), + pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) + ]) + def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): + # GH 25567 + obj = obj(index) + idxr = idxr(obj) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = (r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Cannot index with multidimensional key|" + r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" + "unhashable type: 'numpy.ndarray'" # TypeError + ) + + if (isinstance(obj, Series) and idxr_id == 'getitem' + and index.inferred_type in [ + 'string', 'datetime64', 'period', 'timedelta64', + 'boolean', 'categorical']): + idxr[nd3] + else: + if (isinstance(obj, DataFrame) and idxr_id == 'getitem' + and index.inferred_type == 'boolean'): + error = TypeError + else: + error = ValueError + + with pytest.raises(error, match=msg): + idxr[nd3] + + @pytest.mark.parametrize('index', tm.all_index_generator(5), + ids=lambda x: type(x).__name__) + @pytest.mark.parametrize('obj', [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame( + np.random.randn(len(i), len(i)), index=i, columns=i) + ], ids=['Series', 'DataFrame']) + @pytest.mark.parametrize('idxr, idxr_id', [ + (lambda x: x, 'setitem'), + (lambda x: x.loc, 'loc'), + (lambda x: x.iloc, 'iloc'), + pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) + ]) + def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): + # GH 25567 + obj = obj(index) + idxr = idxr(obj) + nd3 = np.random.randint(5, size=(2, 2, 2)) + + msg = (r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Only 1-dimensional input arrays are supported|" + "'pandas._libs.interval.IntervalTree' object has no attribute" + " 'set_value'|" # AttributeError + "unhashable type: 'numpy.ndarray'|" # TypeError + r"^\[\[\[" # pandas.core.indexing.IndexingError + ) + + if ((idxr_id == 'iloc') + or ((isinstance(obj, Series) and idxr_id == 'setitem' + and index.inferred_type in [ + 'floating', 'string', 'datetime64', 'period', 'timedelta64', + 'boolean', 'categorical'])) + or (idxr_id == 'ix' and index.inferred_type in [ + 'string', 'datetime64', 'period', 'boolean'])): + idxr[nd3] = 0 + else: + with pytest.raises( + (ValueError, AttributeError, TypeError, + pd.core.indexing.IndexingError), match=msg): + idxr[nd3] = 0 + def test_inf_upcast(self): # GH 16957 # We should be able to use np.inf as a key @@ -1015,3 +1105,26 @@ def test_extension_array_cross_section_converts(): result = df.iloc[0] tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('idxr, error, error_message', [ + (lambda x: x, + AttributeError, + "'numpy.ndarray' object has no attribute 'get'"), + (lambda x: x.loc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), + (lambda x: x.iloc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), + pytest.param( + lambda x: x.ix, + ValueError, + "NDFrameIndexer does not support NDFrame objects with ndim > 2", + marks=ignore_ix) +]) +def test_ndframe_indexing_raises(idxr, error, error_message): + # GH 25567 + frame = NDFrame(np.random.randint(5, size=(2, 2, 2))) + with pytest.raises(error, match=error_message): + idxr(frame)[0] From 7c8041b9b6dd44a7388bc8518dc0cd2f7303c2d2 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 30 May 2019 19:16:57 +0000 Subject: [PATCH 09/51] PERF/CI: fix benchmark import error + run asv check on all builds (#26575) * PERF: fix asv import error * CI: run asv check on all builds * PERF: since TimeGrouper was removed, remove benchmarks concerning it * PERF: fix benchmark frame_methods.Iteration.mem_itertuples_to_list The runtime of the benchmark increased in asv 0.4 (which has upgraded asv.extern.asizeof), so bump the timeout upward. 
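
For illustration only (not part of the committed diff below), the
guarded-import pattern this patch applies in
asv_bench/benchmarks/io/parsers.py looks like the following sketch;
the helper names are the private functions touched by the patch:

    try:
        from pandas._libs.tslibs.parsing import (
            _concat_date_cols, _does_string_look_like_datetime)
    except ImportError:
        # asv imports every benchmark module up front, so swallowing
        # the ImportError here keeps a moved or removed private helper
        # from failing collection of the entire benchmark suite
        pass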
--- asv_bench/benchmarks/frame_methods.py | 2 ++ asv_bench/benchmarks/groupby.py | 7 +------ asv_bench/benchmarks/io/parsers.py | 8 ++++++-- azure-pipelines.yml | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 0c1d861ce0839..5b76eeba115a4 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -96,6 +96,8 @@ def time_dict_rename_both_axes(self): class Iteration: + # mem_itertuples_* benchmarks are slow + timeout = 120 def setup(self): N = 1000 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4dfce079dd09c..3097ada6d2022 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,12 +1,11 @@ from functools import partial from itertools import product from string import ascii_letters -import warnings import numpy as np from pandas import ( - Categorical, DataFrame, MultiIndex, Series, TimeGrouper, Timestamp, + Categorical, DataFrame, MultiIndex, Series, Timestamp, date_range, period_range) import pandas.util.testing as tm @@ -301,10 +300,6 @@ def setup(self): def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_dt_timegrouper_size(self): - with warnings.catch_warnings(record=True): - self.df.groupby(TimeGrouper(key='dates', freq='M')).size() - def time_category_size(self): self.draws.groupby(self.cats).size() diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 493955d394443..edba0358c821a 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,7 +1,11 @@ import numpy as np -from pandas._libs.tslibs.parsing import ( - _concat_date_cols, _does_string_look_like_datetime) +try: + from pandas._libs.tslibs.parsing import ( + _concat_date_cols, _does_string_look_like_datetime) +except ImportError: + # Avoid whole benchmark suite import failure on asv (currently 0.4) + pass class DoesStringLookLikeDatetime(object): diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eee38dadfab90..17eaee5458af8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -97,10 +97,11 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev + cd asv_bench + asv check -E existing git remote add upstream https://github.com/pandas-dev/pandas.git git fetch upstream if git diff upstream/master --name-only | grep -q "^asv_bench/"; then - cd asv_bench asv machine --yes ASV_OUTPUT="$(asv dev)" if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then From 4c54dd298692783f417cbaa57d5fc1c0dc1f7c72 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 30 May 2019 20:51:03 +0100 Subject: [PATCH 10/51] TST: update tests\plotting\test_frame.py for mpl 3.1.0 (#26577) --- pandas/plotting/_compat.py | 1 + pandas/tests/plotting/test_frame.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_compat.py b/pandas/plotting/_compat.py index 4077bef8f36f5..36bbe0f4ec174 100644 --- a/pandas/plotting/_compat.py +++ b/pandas/plotting/_compat.py @@ -17,3 +17,4 @@ def inner(): _mpl_ge_2_2_3 = _mpl_version('2.2.3', operator.ge) _mpl_ge_3_0_0 = _mpl_version('3.0.0', operator.ge) +_mpl_ge_3_1_0 = _mpl_version('3.1.0', operator.ge) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index aede84ac831a6..f42f86540e46b 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -23,6 +23,7 @@ from 
pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting +from pandas.plotting._compat import _mpl_ge_3_1_0 @td.skip_if_no_mpl @@ -68,7 +69,11 @@ def test_plot(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - with pytest.raises(AttributeError, match='Unknown property blarg'): + if _mpl_ge_3_1_0(): + msg = "'Line2D' object has no property 'blarg'" + else: + msg = "Unknown property blarg" + with pytest.raises(AttributeError, match=msg): df.plot.line(blarg=True) df = DataFrame(np.random.rand(10, 3), From 0041935572774c6599dd9b48e9acc7cceb559004 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 30 May 2019 22:40:36 -0500 Subject: [PATCH 11/51] Revert test_constructors xfail (#26586) Reverts https://github.com/pandas-dev/pandas/pull/26548 xref https://github.com/numpy/numpy/pull/13663 Closes https://github.com/pandas-dev/pandas/issues/26546 --- pandas/tests/frame/test_constructors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f371f4e93a29e..68017786eb6a6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,7 +15,7 @@ import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, RangeIndex, Series, Timedelta, - Timestamp, compat, date_range, isna) + Timestamp, date_range, isna) from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -113,7 +113,6 @@ def test_constructor_dtype_list_data(self): assert df.loc[1, 0] is None assert df.loc[0, 1] == '2' - @pytest.mark.xfail(compat.numpy._is_numpy_dev, reason="GH-26546") def test_constructor_list_frames(self): # see gh-3243 result = DataFrame([DataFrame()]) From 7f318658b92155678b31780722277d1f8c8df569 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Fri, 31 May 2019 13:41:10 +0100 Subject: [PATCH 12/51] DOC: Fixed redirects in various parts of the documentation (#26497) --- pandas/core/arrays/categorical.py | 3 ++- pandas/core/arrays/interval.py | 2 +- pandas/core/dtypes/concat.py | 2 +- pandas/core/generic.py | 10 +++++----- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/datetimes.py | 8 ++++---- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 3 ++- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 4 ++-- pandas/core/indexing.py | 4 ++-- pandas/core/reshape/concat.py | 2 +- pandas/core/tools/datetimes.py | 2 +- pandas/core/window.py | 6 +++--- pandas/io/json/json.py | 4 ++-- pandas/io/parsers.py | 4 ++-- pandas/io/pytables.py | 6 +++--- 18 files changed, 35 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0fa705369908a..89b86c66d7b05 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -272,7 +272,8 @@ class Categorical(ExtensionArray, PandasObject): Notes ----- See the `user guide - `_ for more. + `_ + for more. Examples -------- diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 94b9dc8ebab55..4f628eff43167 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -95,7 +95,7 @@ Notes ----- See the `user guide -`_ +`_ for more. 
%(examples)s\ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f8488b7a153e3..b22ed45642cf6 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -244,7 +244,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): ----- To learn more about categories, see `link - `__ + `__ Examples -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 87db069d94893..0596d0ab844ec 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3328,8 +3328,8 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/" - "indexing.html#indexing-view-versus-copy" + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" ) else: @@ -3338,8 +3338,8 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): "DataFrame.\n" "Try using .loc[row_indexer,col_indexer] = value " "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/" - "indexing.html#indexing-view-versus-copy" + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" ) if value == 'raise': @@ -7762,7 +7762,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 91bb71a1a8af7..2b190c53da53d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -219,7 +219,7 @@ class providing the base-class of operations. Notes ----- See more `here -`_ +`_ Examples -------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 04d407ebc670d..febfdc7bdf908 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -49,7 +49,7 @@ class Grouper: This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification of available frequencies, please see `here - `_. + `_. axis : number/name of the axis, defaults to 0 sort : boolean, default to False whether to sort the resulting labels diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e68431b79dcd3..1bf3cb86811cb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -215,7 +215,7 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Creating a DatetimeIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`date_range`. @@ -1377,7 +1377,7 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1533,7 +1533,7 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, desired. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1605,7 +1605,7 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, must be specified. To learn more about the frequency strings, please see `this link - `__. 
+ `__. Returns ------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 53e1a36c48994..41cf23c5542a9 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1215,7 +1215,7 @@ def interval_range(start=None, end=None, periods=None, freq=None, ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f1553d9db835f..ec2cc70d1a352 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -182,7 +182,8 @@ class MultiIndex(Index): Notes ----- See the `user guide - `_ for more. + `_ + for more. Examples -------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 64272431cf703..b20b0c6f853d9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -939,7 +939,7 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None): must be specified. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6ae17e62b49c6..0574a4b41c920 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -141,7 +141,7 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Creating a TimedeltaIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`timedelta_range`. @@ -730,7 +730,7 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 86158fa9ee529..7f4827be6dff7 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1190,7 +1190,7 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike""") # noqa + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""") # noqa if not (ax.is_categorical() or ax.is_interval()): warnings.warn(_missing_key_warning, @@ -1339,7 +1339,7 @@ class _IXIndexer(_NDFrameIndexer): .iloc for positional indexing See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated""") # noqa + http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""") # noqa def __init__(self, name, obj): warnings.warn(self._ix_deprecation_warning, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ee3ed3899a55f..4523a6ad48f19 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -100,7 +100,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, A walkthrough of how this method fits in with other tools for combining pandas objects can be found `here - `__. + `__. 
Examples -------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 817d539d4ad6f..0756bdb3777ec 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -533,7 +533,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, dtype: datetime64[ns] If a date does not meet the `timestamp limitations - `_, passing errors='ignore' will return the original input instead of raising any exception. diff --git a/pandas/core/window.py b/pandas/core/window.py index d51e12035c829..f332075380c79 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -462,7 +462,7 @@ class Window(_Window): See the notes below for further information. on : str, optional For a DataFrame, column on which to calculate - the rolling window, rather than the index + the rolling window, rather than the index. axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or @@ -488,7 +488,7 @@ class Window(_Window): changed to the center of the window by setting ``center=True``. To learn more about the offsets & frequency strings, please see `this link - `__. + `__. The recognized win_types are: @@ -2188,7 +2188,7 @@ class EWM(_Rolling): (if adjust is True), and 1-alpha and alpha (if adjust is False). More details can be found at - http://pandas.pydata.org/pandas-docs/stable/computation.html#exponentially-weighted-windows + http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows Examples -------- diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index ee9d9e000d7e3..20bed9bff7383 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -330,8 +330,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, chunksize : integer, default None Return JsonReader object for iteration. - See the `line-delimted json docs - `_ + See the `line-delimited json docs + `_ for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c65c11e840c27..bcbdd80865360 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -58,7 +58,7 @@ into chunks. Additional help can be found in the online docs for -`IO Tools `_. +`IO Tools `_. Parameters ---------- @@ -753,7 +753,7 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, into chunks. Additional help can be found in the `online docs for IO Tools - `_. + `_. Parameters ---------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 11f705e88179d..53ef2395a302a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -867,8 +867,8 @@ def put(self, key, value, format=None, append=False, **kwargs): This will force Table format, append the input data to the existing. data_columns : list of columns to create as data columns, or True to - use all columns. See - `here `__ # noqa + use all columns. See `here + `__. encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' @@ -949,7 +949,7 @@ def append(self, key, value, format=None, append=True, columns=None, List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here - `__. + `__. 
min_itemsize : dict of columns that specify minimum string sizes
         nan_rep : string to use as string nan representation
         chunksize : size to chunk the writing

From c6a7cc1e08f9203caf57599244cd1c51f6347875 Mon Sep 17 00:00:00 2001
From: h-vetinari <33685575+h-vetinari@users.noreply.github.com>
Date: Sat, 1 Jun 2019 02:17:53 +0200
Subject: [PATCH 13/51] TST: Datetime conftest.py improvements (#26596)

xref gh-23537
---
 pandas/conftest.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index 3c411f8ba3e31..8f71028f51ab4 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -376,10 +376,16 @@ def unique_nulls_fixture(request):
     FixedOffset(0), FixedOffset(-300), timezone.utc,
     timezone(timedelta(hours=1)),
     timezone(timedelta(hours=-1), name='foo')]
+TIMEZONE_IDS = ['None', 'UTC', 'US/Eastern', 'Asia/Tokyo',
+                'dateutil/US/Pacific', 'dateutil/Asia/Singapore',
+                'dateutil.tz.tzutc()', 'dateutil.tz.tzlocal()',
+                'pytz.FixedOffset(300)', 'pytz.FixedOffset(0)',
+                'pytz.FixedOffset(-300)', 'datetime.timezone.utc',
+                'datetime.timezone.+1', 'datetime.timezone.-1.named']


-@td.parametrize_fixture_doc(str(TIMEZONES))
-@pytest.fixture(params=TIMEZONES)
+@td.parametrize_fixture_doc(str(TIMEZONE_IDS))
+@pytest.fixture(params=TIMEZONES, ids=TIMEZONE_IDS)
 def tz_naive_fixture(request):
     """
     Fixture for trying timezones including default (None): {0}
@@ -387,8 +393,8 @@ def tz_naive_fixture(request):
     return request.param


-@td.parametrize_fixture_doc(str(TIMEZONES[1:]))
-@pytest.fixture(params=TIMEZONES[1:])
+@td.parametrize_fixture_doc(str(TIMEZONE_IDS[1:]))
+@pytest.fixture(params=TIMEZONES[1:], ids=TIMEZONE_IDS[1:])
 def tz_aware_fixture(request):
     """
     Fixture for trying explicit timezones: {0}
@@ -398,6 +404,8 @@ def tz_aware_fixture(request):

 # ----------------------------------------------------------------
 # Dtypes
+# ----------------------------------------------------------------
+
 UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_EA_INT_DTYPES = ["UInt8", "UInt16", "UInt32", "UInt64"]
 SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"]
@@ -409,8 +417,8 @@ def tz_aware_fixture(request):
 COMPLEX_DTYPES = [complex, "complex64", "complex128"]
 STRING_DTYPES = [str, 'str', 'U']

-DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]']
-TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]']
+DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]']
+TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]']
 BOOL_DTYPES = [bool, 'bool']
 BYTES_DTYPES = [bytes, 'bytes']

@@ -418,7 +426,7 @@ def tz_aware_fixture(request):
 ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
 ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES +
-                    DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES +
+                    DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES +
                     OBJECT_DTYPES + BYTES_DTYPES)


From 7f3423c5a22122a01e69a1fc5090bb86deb845fb Mon Sep 17 00:00:00 2001
From: Alexander Nordin
Date: Sat, 1 Jun 2019 10:04:14 -0400
Subject: [PATCH 14/51] ERR: better error message on too large excel sheet
 (#26080)

---
 doc/source/whatsnew/v0.25.0.rst |  1 +
 pandas/io/formats/excel.py      | 10 ++++++++++
 pandas/tests/io/test_excel.py   | 18 ++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 89a9da4a73b35..ae5b6aafe4c7d 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -533,6 +533,7 @@ I/O
 - Fixed memory leak in :meth:`DataFrame.to_json` when dealing with
numeric data (:issue:`24889`) - Bug in :func:`read_json` where date strings with ``Z`` were not converted to a UTC timezone (:issue:`26168`) - Added ``cache_dates=True`` parameter to :meth:`read_csv`, which allows to cache unique dates when they are parsed (:issue:`25990`) +- :meth:`DataFrame.to_excel` now raises a ``ValueError`` when the caller's dimensions exceed the limitations of Excel (:issue:`26051`) Plotting ^^^^^^^^ diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index fd6e3304ec4ef..4db00e34b39e2 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -341,6 +341,9 @@ class ExcelFormatter: This is only called for body cells. """ + max_rows = 2**20 + max_cols = 2**14 + def __init__(self, df, na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, merge_cells=False, inf_rep='inf', style_converter=None): @@ -648,6 +651,13 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, from pandas.io.excel import ExcelWriter from pandas.io.common import _stringify_path + num_rows, num_cols = self.df.shape + if num_rows > self.max_rows or num_cols > self.max_cols: + raise ValueError("This sheet is too large! Your sheet size is: " + + "{}, {} ".format(num_rows, num_cols) + + "Max sheet size is: {}, {}". + format(self.max_rows, self.max_cols)) + if isinstance(writer, ExcelWriter): need_save = False else: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 1421fc94b67f4..7693caf3b31d2 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1118,6 +1118,24 @@ class and any subclasses, on account of the `autouse=True` class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. + def test_excel_sheet_size(self): + + # GH 26080 + breaking_row_count = 2**20 + 1 + breaking_col_count = 2**14 + 1 + # purposely using two arrays to prevent memory issues while testing + row_arr = np.zeros(shape=(breaking_row_count, 1)) + col_arr = np.zeros(shape=(1, breaking_col_count)) + row_df = pd.DataFrame(row_arr) + col_df = pd.DataFrame(col_arr) + + msg = "sheet is too large" + with pytest.raises(ValueError, match=msg): + row_df.to_excel(self.path) + + with pytest.raises(ValueError, match=msg): + col_df.to_excel(self.path) + def test_excel_sheet_by_name_raise(self, *_): import xlrd From 3fbe6270980e1234d94b65103d0085f73c7c7f10 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 1 Jun 2019 14:08:20 +0000 Subject: [PATCH 15/51] CLN: remove sample_time attributes from benchmarks (#26598) --- asv_bench/benchmarks/index_object.py | 1 - asv_bench/benchmarks/rolling.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 0fdf46e7c64de..896a20bae2069 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -52,7 +52,6 @@ def time_is_dates_only(self): class Ops: - sample_time = 0.2 params = ['float', 'int'] param_names = ['dtype'] diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 2532d326dff4b..033b466c8b9be 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -4,7 +4,6 @@ class Methods: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], @@ -23,7 +22,6 @@ def time_rolling(self, constructor, window, dtype, method): class ExpandingMethods: - sample_time = 0.2 params = (['DataFrame', 'Series'], ['int', 'float'], ['median', 'mean', 
'max', 'min', 'std', 'count', 'skew', 'kurt', @@ -41,7 +39,6 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], @@ -58,7 +55,6 @@ def time_ewm(self, constructor, window, dtype, method): class VariableWindowMethods(Methods): - sample_time = 0.2 params = (['DataFrame', 'Series'], ['50s', '1h', '1d'], ['int', 'float'], @@ -75,7 +71,6 @@ def setup(self, constructor, window, dtype, method): class Pairwise: - sample_time = 0.2 params = ([10, 1000, None], ['corr', 'cov'], [True, False]) @@ -95,7 +90,6 @@ def time_pairwise(self, window, method, pairwise): class Quantile: - sample_time = 0.2 params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'], From a498a2e609291d4e3007f83d6e82a0b7283ecfb1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 1 Jun 2019 15:09:27 +0100 Subject: [PATCH 16/51] TST: add concrete examples of dataframe fixtures to docstrings (#26593) --- pandas/tests/frame/conftest.py | 169 +++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 27c0e070c10c2..c451cd58f1497 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -11,6 +11,25 @@ def float_frame(): Fixture for DataFrame of floats with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] """ return DataFrame(tm.getSeriesData()) @@ -21,6 +40,25 @@ def float_frame_with_na(): Fixture for DataFrame of floats with index of unique strings Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 + DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 + neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 + 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 + 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 + soujjZ0A08 NaN NaN NaN NaN + 7W6NLGsjB9 NaN NaN NaN NaN + ... ... ... ... ... 
+ uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 + n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 + ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 + uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 + 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 + 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 + sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) # set some NAs @@ -35,6 +73,25 @@ def bool_frame_with_na(): Fixture for DataFrame of booleans with index of unique strings Columns are ['A', 'B', 'C', 'D']; some entries are missing + + A B C D + zBZxY2IDGd False False False False + IhBWBMWllt False True True True + ctjdvZSR6R True False True True + AVTujptmxb False True False True + G9lrImrSWq False False False True + sFFwdIUfz2 NaN NaN NaN NaN + s15ptEJnRb NaN NaN NaN NaN + ... ... ... ... ... + UW41KkDyZ4 True True False False + l9l6XkOdqV True False False False + X2MeZfzDYA False True False False + xWkIKU7vfX False True False True + QOhL6VmpGU False False False True + 22PwkRJdat False True False False + kfboQ3VeIK True False True False + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) > 0 df = df.astype(object) @@ -50,6 +107,25 @@ def int_frame(): Fixture for DataFrame of ints with index of unique strings Columns are ['A', 'B', 'C', 'D'] + + A B C D + vpBeWjM651 1 0 1 0 + 5JyxmrP1En -1 0 0 0 + qEDaoD49U2 -1 1 0 0 + m66TkTfsFe 0 0 0 0 + EHPaNzEUFm -1 0 -1 0 + fpRJCevQhi 2 0 0 0 + OlQvnmfi3Q 0 0 -2 0 + ... .. .. .. .. + uB1FPlz4uP 0 0 0 1 + EcSe6yNzCU 0 0 -1 0 + L50VudaiI8 -1 1 -2 0 + y3bpw4nwIp 0 -1 0 0 + H0RdLLwrCT 1 1 0 0 + rY82K0vMwm 0 0 0 0 + 1OPIUjnkjk 2 0 0 0 + + [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) # force these all to int64 to avoid platform testing issues @@ -62,6 +138,25 @@ def datetime_frame(): Fixture for DataFrame of floats with DatetimeIndex Columns are ['A', 'B', 'C', 'D'] + + A B C D + 2000-01-03 -1.122153 0.468535 0.122226 1.693711 + 2000-01-04 0.189378 0.486100 0.007864 -1.216052 + 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 + 2000-01-06 0.430050 0.894352 0.090719 0.036939 + 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 + 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 + 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 + ... ... ... ... ... + 2000-02-03 1.642618 -0.579288 0.046005 1.385249 + 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 + 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 + 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 + 2000-02-09 1.377373 0.398619 1.008453 -0.928207 + 2000-02-10 0.473194 -0.636677 0.984058 0.511519 + 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 + + [30 rows x 4 columns] """ return DataFrame(tm.getTimeSeriesData()) @@ -72,6 +167,25 @@ def float_string_frame(): Fixture for DataFrame of floats and strings with index of unique strings Columns are ['A', 'B', 'C', 'D', 'foo']. + + A B C D foo + w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar + PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar + ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar + 3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar + khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar + LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar + HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar + ... ... ... ... ... ... 
+ 9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar + h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar + mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar + oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar + 9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar + jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar + lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar + + [30 rows x 5 columns] """ df = DataFrame(tm.getSeriesData()) df['foo'] = 'bar' @@ -84,6 +198,25 @@ def mixed_float_frame(): Fixture for DataFrame of different float types with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993 + KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588 + VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731 + kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607 + CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266 + 0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541 + tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710 + ... ... ... ... ... + 7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237 + 4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612 + B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653 + hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427 + 1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827 + 9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204 + xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502 + + [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) df.A = df.A.astype('float32') @@ -99,6 +232,25 @@ def mixed_int_frame(): Fixture for DataFrame of different int types with index of unique strings Columns are ['A', 'B', 'C', 'D']. + + A B C D + mUrCZ67juP 0 1 2 2 + rw99ACYaKS 0 1 0 0 + 7QsEcpaaVU 0 1 1 1 + xkrimI2pcE 0 1 0 0 + dz01SuzoS8 0 1 255 255 + ccQkqOHX75 -1 1 0 0 + DN0iXaoDLd 0 1 0 0 + ... .. .. ... ... + Dfb141wAaQ 1 1 254 254 + IPD8eQOVu5 0 1 0 0 + CcaKulsCmv 0 1 0 0 + rIBa8gu7E5 0 1 0 0 + RP6peZmh5o 0 1 1 1 + NMb9pipQWQ 0 1 0 0 + PqgbJEzjib 0 1 3 3 + + [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) df.A = df.A.astype('int32') @@ -114,6 +266,11 @@ def timezone_frame(): Fixture for DataFrame of date_range Series with different time zones Columns are ['A', 'B', 'C']; some entries are missing + + A B C + 0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00 + 1 2013-01-02 NaT NaT + 2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00 """ df = DataFrame({'A': date_range('20130101', periods=3), 'B': date_range('20130101', periods=3, @@ -131,6 +288,11 @@ def simple_frame(): Fixture for simple 3x3 DataFrame Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. + + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 """ arr = np.array([[1., 2., 3.], [4., 5., 6.], @@ -147,6 +309,13 @@ def frame_of_index_cols(): Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
+
+         A      B  C         D         E    (tuple, as, label)
+    0  foo    one  a  0.608477 -0.012500           -1.664297
+    1  foo    two  b -0.633460  0.249614           -0.364411
+    2  foo  three  c  0.615256  2.154968           -0.834666
+    3  bar    one  d  0.234246  1.085675            0.718445
+    4  bar    two  e  0.533841 -0.005702           -3.533912
     """
     df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                     'B': ['one', 'two', 'three', 'one', 'two'],

From 3628e1897bfcc548e2fccb020cd436a037da4f31 Mon Sep 17 00:00:00 2001
From: Marc Garcia
Date: Sat, 1 Jun 2019 15:12:40 +0100
Subject: [PATCH 17/51] CI/DOC: Building documentation with azure (#26591)

---
 .travis.yml                |  4 +--
 azure-pipelines.yml        | 62 +++++++++++++++++++++++++++++++++++++-
 ci/deps/travis-36-doc.yaml | 46 ----------------------------
 3 files changed, 63 insertions(+), 49 deletions(-)
 delete mode 100644 ci/deps/travis-36-doc.yaml

diff --git a/.travis.yml b/.travis.yml
index ce8817133a477..90dd904e6cb1e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -51,14 +51,14 @@ matrix:
     # In allow_failures
     - dist: trusty
      env:
-        - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
+        - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true

   allow_failures:
     - dist: trusty
      env:
        - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow"
    - dist: trusty
      env:
-        - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true
+        - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true

 before_install:
   - echo "before_install"
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index eee38dadfab90..9f83917024049 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -15,7 +15,7 @@ jobs:
       name: Windows
       vmImage: vs2017-win2016

-- job: 'Checks_and_doc'
+- job: 'Checks'
   pool:
     vmImage: ubuntu-16.04
   timeoutInMinutes: 90
@@ -116,3 +116,63 @@ jobs:
           fi
       displayName: 'Running benchmarks'
       condition: true
+
+- job: 'Docs'
+  pool:
+    vmImage: ubuntu-16.04
+  timeoutInMinutes: 90
+  steps:
+  - script: |
+      echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev'
+      echo '##vso[task.setvariable variable=ENV_FILE]environment.yml'
+    displayName: 'Setting environment variables'
+
+  - script: |
+      export PATH=$HOME/miniconda3/bin:$PATH
+      sudo apt-get install -y libc6-dev-i386
+      ci/setup_env.sh
+    displayName: 'Setup environment and build pandas'
+
+  - script: |
+      export PATH=$HOME/miniconda3/bin:$PATH
+      source activate pandas-dev
+      doc/make.py
+    displayName: 'Build documentation'
+
+  - script: |
+      cd doc/build/html
+      git init
+      touch .nojekyll
+      git add --all .
+      git config user.email "pandas-dev@python.org"
+      git config user.name "pandas-docs-bot"
+      git commit -m "pandas documentation in master"
+    displayName: 'Create git repo for docs build'
+    condition : |
+      and(not(eq(variables['Build.Reason'], 'PullRequest')),
+          eq(variables['Build.SourceBranch'], 'refs/heads/master'))
+
+  # For this task to work, the following steps are required:
+  # 1. Go to "Library > Secure files" in the azure-pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles
+  # 2. Click on "+ Secure file"
+  # 3. Upload the private key (the file name must match the one specified in the "sshKeySecureFile" input below, "pandas_docs_key")
+  # 4. Click on the file name after it is created, tick the box "Authorize for use in all pipelines" and save
+  # 5.
The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be specified as a deploy key of the repo where the docs will be pushed: https://github.com/pandas-dev/pandas-dev.github.io/settings/keys + - task: InstallSSHKey@0 + inputs: + hostName: 'github.com' + sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org' + sshKeySecureFile: 'pandas_docs_key' + displayName: 'Install GitHub ssh deployment key' + condition : | + and(not(eq(variables['Build.Reason'], 'PullRequest')), + eq(variables['Build.SourceBranch'], 'refs/heads/master')) + + - script: | + cd doc/build/html + git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git + git push origin master -f + displayName: 'Publish docs to GitHub pages' + condition : | + and(not(eq(variables['Build.Reason'], 'PullRequest')), + eq(variables['Build.SourceBranch'], 'refs/heads/master')) diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml deleted file mode 100644 index 9d6cbd82fdc05..0000000000000 --- a/ci/deps/travis-36-doc.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - beautifulsoup4 - - bottleneck - - cython>=0.28.2 - - fastparquet>=0.2.1 - - gitpython - - html5lib - - hypothesis>=3.58.0 - - ipykernel - - ipython - - ipywidgets - - lxml - - matplotlib - - nbconvert>=5.4.1 - - nbformat - - nbsphinx - - notebook>=5.7.5 - - numexpr - - numpy - - numpydoc - - openpyxl - - pandoc - - pyarrow - - pyqt - - pytables - - python-dateutil - - python-snappy - - python=3.6.* - - pytz - - scipy - - seaborn - - sphinx - - sqlalchemy - - statsmodels - - xarray - - xlrd - - xlsxwriter - - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist - - isort From 6904c230e29a40a110182fd42db8aaee2701c83b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 1 Jun 2019 09:35:25 -0500 Subject: [PATCH 18/51] DOC: sparse doc fixups (#26571) --- doc/source/user_guide/sparse.rst | 2 +- doc/source/whatsnew/v0.16.0.rst | 2 ++ doc/source/whatsnew/v0.18.1.rst | 2 ++ doc/source/whatsnew/v0.19.0.rst | 2 ++ doc/source/whatsnew/v0.20.0.rst | 1 + pandas/core/sparse/frame.py | 2 +- pandas/core/sparse/series.py | 2 +- 7 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 8fed29d7a6316..09ed895a847ff 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -269,7 +269,7 @@ have no replacement. Interaction with scipy.sparse ----------------------------- -Use :meth:`DataFrame.sparse.from_coo` to create a ``DataFrame`` with sparse values from a sparse matrix. +Use :meth:`DataFrame.sparse.from_spmatrix` to create a ``DataFrame`` with sparse values from a sparse matrix. .. 
versionadded:: 0.25.0 diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 1e4ec682f0504..2cb09325c9466 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -92,6 +92,7 @@ Interaction with scipy.sparse Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels: .. ipython:: python + :okwarning: s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), @@ -121,6 +122,7 @@ The from_coo method is a convenience method for creating a ``SparseSeries`` from a ``scipy.sparse.coo_matrix``: .. ipython:: python + :okwarning: from scipy import sparse A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index f099ccf284bc2..069395c2e0f36 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -394,6 +394,7 @@ used in the ``pandas`` implementation (:issue:`12644`, :issue:`12638`, :issue:`1 An example of this signature augmentation is illustrated below: .. ipython:: python + :okwarning: sp = pd.SparseDataFrame([1, 2, 3]) sp @@ -409,6 +410,7 @@ Previous behaviour: New behaviour: .. ipython:: python + :okwarning: np.cumsum(sp, axis=0) diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 29eeb415e2f6d..de29a1eb93709 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1236,6 +1236,7 @@ Operators now preserve dtypes - Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) .. ipython:: python + :okwarning: s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) s.dtype @@ -1245,6 +1246,7 @@ Operators now preserve dtypes - Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) .. ipython:: python + :okwarning: s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) s diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 741aa6ca143bb..6a88a5810eca4 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -339,6 +339,7 @@ See the :ref:`documentation ` for more information. (:issue: All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. .. ipython:: python + :okwarning: from scipy.sparse import csr_matrix arr = np.random.random(size=(1000, 5)) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index fa3cd781eaf88..bf1cec7571f4d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -42,7 +42,7 @@ class SparseDataFrame(DataFrame): DataFrame containing sparse floating point data in the form of SparseSeries objects - .. deprectaed:: 0.25.0 + .. deprecated:: 0.25.0 Use a DataFrame with sparse values instead. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index e4f8579a398dd..3f95acdbfb42c 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -46,7 +46,7 @@ class SparseSeries(Series): """Data structure for labeled, sparse floating point data - .. deprectaed:: 0.25.0 + .. deprecated:: 0.25.0 Use a Series with sparse values instead. 
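
A minimal sketch of the replacement construction that the deprecation
notes above point users to (illustrative only; it assumes the
sparse-dtype Series/DataFrame API available since pandas 0.24):

    import pandas as pd

    # instead of the deprecated pd.SparseSeries([1.0, 0.0, 2.0])
    s = pd.Series([1.0, 0.0, 2.0], dtype="Sparse[float64]")

    # instead of the deprecated pd.SparseDataFrame({"A": [0, 1]})
    df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])})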
From 2630a0b83b61b783da336bcc3823e5d7bd302488 Mon Sep 17 00:00:00 2001
From: nathalier
Date: Sat, 1 Jun 2019 15:45:06 +0100
Subject: [PATCH 19/51] BUG: ignore errors for invalid dates in to_datetime()
 with errors=coerce (#25512) (#26561)

---
 doc/source/whatsnew/v0.25.0.rst              |  1 +
 pandas/core/tools/datetimes.py               |  6 +++---
 pandas/tests/indexes/datetimes/test_tools.py | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index ae5b6aafe4c7d..a62cac7a94bbd 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -427,6 +427,7 @@ Datetimelike
 - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`)
 - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`)
 - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`)
+- Bug in :func:`to_datetime` which raises an unhandled ``OverflowError`` when called with a mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``errors='coerce'`` (:issue:`25512`)

 Timedelta
 ^^^^^^^^^
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 0756bdb3777ec..73119671550a5 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -775,21 +775,21 @@ def calc_with_mask(carg, mask):
     # try intlike / strings that are ints
     try:
         return calc(arg.astype(np.int64))
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     # a float with actual np.nan
     try:
         carg = arg.astype(np.float64)
         return calc_with_mask(carg, notna(carg))
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     # string with NaN-like
     try:
         mask = ~algorithms.isin(arg, list(tslib.nat_strings))
         return calc_with_mask(arg, mask)
-    except ValueError:
+    except (ValueError, OverflowError):
         pass

     return None
diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
index d62d8d1276fec..c507c31ee54dd 100644
--- a/pandas/tests/indexes/datetimes/test_tools.py
+++ b/pandas/tests/indexes/datetimes/test_tools.py
@@ -96,6 +96,25 @@ def test_to_datetime_format_YYYYMMDD(self, cache):
         result = pd.to_datetime(s, format='%Y%m%d', errors='coerce',
                                 cache=cache)
         expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]')
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("input_s, expected", [
+        # NaN before strings with invalid date values
+        [Series(['19801222', np.nan, '20010012', '10019999']),
+         Series([Timestamp('19801222'), np.nan, np.nan, np.nan])],
+        # NaN after strings with invalid date values
+        [Series(['19801222', '20010012', '10019999', np.nan]),
+         Series([Timestamp('19801222'), np.nan, np.nan, np.nan])],
+        # NaN before integers with invalid date values
+        [Series([20190813, np.nan, 20010012, 20019999]),
+         Series([Timestamp('20190813'), np.nan, np.nan, np.nan])],
+        # NaN after integers with invalid date values
+        [Series([20190813, 20010012, np.nan, 20019999]),
+         Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]])
+    def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected):
+        # GH 25512
+        # format='%Y%m%d', errors='coerce'
+        result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce')
         assert_series_equal(result, expected)

     @pytest.mark.parametrize('cache', [True,
False]) From 4ec92eb45ac5ae2f7bc97e53a36294dab548e0c4 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Sat, 1 Jun 2019 22:48:37 +0800 Subject: [PATCH 20/51] TST/CLN: Fixturize tests/frame/test_quantile.py (#26556) --- pandas/tests/frame/test_quantile.py | 56 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index a5771839e0997..9ccbd290923ba 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -3,24 +3,24 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameQuantile(TestData): +class TestDataFrameQuantile: - def test_quantile(self): + def test_quantile(self, datetime_frame): from numpy import percentile - q = self.tsframe.quantile(0.1, axis=0) - assert q['A'] == percentile(self.tsframe['A'], 10) - tm.assert_index_equal(q.index, self.tsframe.columns) + df = datetime_frame + q = df.quantile(0.1, axis=0) + assert q['A'] == percentile(df['A'], 10) + tm.assert_index_equal(q.index, df.columns) - q = self.tsframe.quantile(0.9, axis=1) + q = df.quantile(0.9, axis=1) assert (q['2000-01-17'] == - percentile(self.tsframe.loc['2000-01-17'], 90)) - tm.assert_index_equal(q.index, self.tsframe.index) + percentile(df.loc['2000-01-17'], 90)) + tm.assert_index_equal(q.index, df.index) # test degenerate case q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) @@ -99,18 +99,6 @@ def test_quantile_axis_parameter(self): def test_quantile_interpolation(self): # see gh-10174 - from numpy import percentile - - # interpolation = linear (default case) - q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') - assert q['A'] == percentile(self.tsframe['A'], 10) - q = self.intframe.quantile(0.1) - assert q['A'] == percentile(self.intframe['A'], 10) - - # test with and without interpolation keyword - q1 = self.intframe.quantile(0.1) - assert q1['A'] == np.percentile(self.intframe['A'], 10) - tm.assert_series_equal(q, q1) # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) @@ -155,6 +143,28 @@ def test_quantile_interpolation(self): index=[.25, .5], columns=['a', 'b', 'c']) assert_frame_equal(result, expected) + def test_quantile_interpolation_datetime(self, datetime_frame): + # see gh-10174 + + # interpolation = linear (default case) + df = datetime_frame + q = df.quantile(0.1, axis=0, interpolation='linear') + assert q['A'] == np.percentile(df['A'], 10) + + def test_quantile_interpolation_int(self, int_frame): + # see gh-10174 + + df = int_frame + # interpolation = linear (default case) + q = df.quantile(0.1) + assert q['A'] == np.percentile(df['A'], 10) + + # test with and without interpolation keyword + # TODO: q1 is not different from q + q1 = df.quantile(0.1) + assert q1['A'] == np.percentile(df['A'], 10) + tm.assert_series_equal(q, q1) + def test_quantile_multi(self): df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['a', 'b', 'c']) @@ -214,11 +224,11 @@ def test_quantile_datetime(self): # result = df[['a', 'c']].quantile(.5) # result = df[['a', 'c']].quantile([.5]) - def test_quantile_invalid(self): + def test_quantile_invalid(self, datetime_frame): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, 
match=msg): - self.tsframe.quantile(invalid) + datetime_frame.quantile(invalid) def test_quantile_box(self): df = DataFrame({'A': [pd.Timestamp('2011-01-01'), From 9ebbe1bc8f054fcb128181b95a7d22d33da78f36 Mon Sep 17 00:00:00 2001 From: Big Head Date: Sat, 1 Jun 2019 10:51:27 -0400 Subject: [PATCH 21/51] BUG: fix categorical comparison with missing values (#26504 ) (#26514) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/arrays/categorical.py | 13 +++++--- .../arrays/categorical/test_operators.py | 32 ++++++++++++++++++- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a62cac7a94bbd..61182b9fa32f2 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -414,7 +414,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) - Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 89b86c66d7b05..44bb44457bc25 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -89,18 +89,23 @@ def f(self, other): else: other_codes = other._codes - na_mask = (self._codes == -1) | (other_codes == -1) + mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) ret = f(other_codes) - if na_mask.any(): + if mask.any(): # In other series, the leads to False, so do that here too - ret[na_mask] = False + ret[mask] = False return ret if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - return getattr(self._codes, op)(i) + ret = getattr(self._codes, op)(i) + + # check for NaN in self + mask = (self._codes == -1) + ret[mask] = False + return ret else: if op == '__eq__': return np.repeat(False, len(self)) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index dc6e1a5bc36b3..a443408bf9479 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -1,4 +1,5 @@ import operator +import warnings import numpy as np import pytest @@ -17,7 +18,6 @@ def test_categories_none_comparisons(self): tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] expected = self.factor[np.asarray(self.factor) == 'a'] tm.assert_categorical_equal(result, expected) @@ -186,6 +186,36 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + def test_comparison_of_ordered_categorical_with_nan_to_scalar( + self, compare_operators_no_eq_ne): + # https://github.com/pandas-dev/pandas/issues/26504 + # BUG: fix ordered categorical comparison with missing values (#26504 ) + # and following comparisons with scalars in categories with missing + # values should be evaluated as False + + cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True) + scalar = 2 + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + expected = getattr(np.array(cat), + compare_operators_no_eq_ne)(scalar) + actual = getattr(cat, compare_operators_no_eq_ne)(scalar) + tm.assert_numpy_array_equal(actual, expected) + + def 
test_comparison_of_ordered_categorical_with_nan_to_listlike(
+            self, compare_operators_no_eq_ne):
+        # https://github.com/pandas-dev/pandas/issues/26504
+        # comparisons of missing values in an ordered Categorical with a
+        # list-like should evaluate to False
+
+        cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
+        other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", RuntimeWarning)
+            expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2)
+        actual = getattr(cat, compare_operators_no_eq_ne)(other)
+        tm.assert_numpy_array_equal(actual, expected)
+
 
     @pytest.mark.parametrize('data,reverse,base', [
         (list("abc"), list("cba"), list("bbb")),
         ([1, 2, 3], [3, 2, 1], [2, 2, 2])]

From 2c6d005d073ad48a4a2795f1965e660df3fbfa8c Mon Sep 17 00:00:00 2001
From: enisnazif 
Date: Sat, 1 Jun 2019 15:52:35 +0100
Subject: [PATCH 22/51] Fix the output of df.describe on an empty categorical
 / object column (#26474)

---
 doc/source/whatsnew/v0.25.0.rst      | 28 ++++++++++++++++++++++++++++
 pandas/core/arrays/categorical.py    |  2 +-
 pandas/core/generic.py               |  6 ++++++
 pandas/tests/frame/test_analytics.py | 11 +++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 61182b9fa32f2..ebca80025b9f7 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -253,6 +253,34 @@ are returned. (:issue:`21521`)
 
     df.groupby("a").ffill()
 
+``DataFrame`` describe on an empty categorical / object column will return top and freq
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When calling :meth:`DataFrame.describe` with an empty categorical / object
+column, the 'top' and 'freq' columns were previously omitted, which was inconsistent with
+the output for non-empty columns. Now the 'top' and 'freq' columns will always be included,
+with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`)
+
+.. ipython:: python
+
+    df = pd.DataFrame({"empty_col": pd.Categorical([])})
+    df
+
+*Previous Behavior*:
+
+.. code-block:: python
+
+    In [3]: df.describe()
+    Out[3]:
+           empty_col
+    count          0
+    unique         0
+
+*New Behavior*:
+
+.. 
ipython:: python
+
+    df.describe()
 
 ``__str__`` methods now call ``__repr__`` rather than vice-versa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 44bb44457bc25..49dd0041854bc 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1483,7 +1483,7 @@ def value_counts(self, dropna=True):
 
         if dropna or clean:
             obs = code if clean else code[mask]
-            count = bincount(obs, minlength=ncat or None)
+            count = bincount(obs, minlength=ncat or 0)
         else:
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 0596d0ab844ec..7ca2c52e18c41 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9920,6 +9920,12 @@ def describe_categorical_1d(data):
                 names += ['top', 'freq']
                 result += [top, freq]
 
+            # If the DataFrame is empty, set 'top' and 'freq' to None
+            # to maintain output shape consistency
+            else:
+                names += ['top', 'freq']
+                result += [None, None]
+
             return pd.Series(result, index=names, name=data.name)
 
         def describe_1d(data):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index effe7eb47323d..487ff7932ec5f 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -588,6 +588,16 @@ def test_describe_categorical(self):
         result = df3.describe()
         tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
 
+    def test_describe_empty_categorical_column(self):
+        # GH 26397
+        # Ensure the index of an empty categorical DataFrame column
+        # also contains (count, unique, top, freq)
+        df = pd.DataFrame({"empty_col": Categorical([])})
+        result = df.describe()
+        expected = DataFrame({'empty_col': [0, 0, None, None]},
+                             index=['count', 'unique', 'top', 'freq'])
+        tm.assert_frame_equal(result, expected)
+
     def test_describe_categorical_columns(self):
         # GH 11558
         columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
@@ -608,6 +618,7 @@ def test_describe_categorical_columns(self):
                              index=['count', 'mean', 'std', 'min', '25%',
                                     '50%', '75%', 'max'],
                              columns=exp_columns)
+
     tm.assert_frame_equal(result, expected)
     tm.assert_categorical_equal(result.columns.values,
                                 expected.columns.values)

From 73d8f96bac5bb0bc58eb6f69d47ea4329b07c6ae Mon Sep 17 00:00:00 2001
From: Jiang Yue <35633013+jiangyue12392@users.noreply.github.com>
Date: Sat, 1 Jun 2019 22:56:34 +0800
Subject: [PATCH 23/51] BUG: MultiIndex not dropping nan level and invalid
 code value (#26408)

---
 doc/source/whatsnew/v0.25.0.rst               | 37 ++++++++++-
 pandas/core/indexes/multi.py                  | 62 ++++++++++++++---
 .../tests/indexes/multi/test_constructor.py   | 41 +++++++++++-
 pandas/tests/indexes/multi/test_missing.py    | 15 +++++
 4 files changed, 143 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index ebca80025b9f7..3275223b159f8 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -119,6 +119,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
 
     df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
 
+
+.. _whatsnew_0250.api_breaking.multi_indexing:
+
+
+MultiIndex constructed from levels and codes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, constructing a :class:`MultiIndex` with NaN levels or code values < -1 was allowed.
+Now, construction with code values < -1 raises a ``ValueError``, and codes corresponding
+to NaN levels are reassigned to -1. 
(:issue:`19387`)
+
+.. ipython:: python
+
+    mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
+                        codes=[[0, -1, 1, 2, 3, 4]])
+    mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]])
+
+*Previous Behavior*:
+
+.. code-block:: ipython
+
+    In [1]: mi1
+    Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]],
+                       codes=[[0, -1, 1, 2, 3, 4]])
+    In [2]: mi2
+    Out[2]: MultiIndex(levels=[[1, 2]],
+                       codes=[[0, -2]])
+
+*New Behavior*:
+
+.. ipython:: python
+
+    mi1
+    mi2
+
+
 .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:
 
 GroupBy.apply on ``DataFrame`` evaluates first group only once
@@ -536,7 +572,6 @@ MultiIndex
 
 - Bug in which an incorrect exception was raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`)
 -
--
 
 I/O
 ^^^

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index ec2cc70d1a352..9217b388ce86b 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -243,11 +243,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
         result.sortorder = sortorder
 
         if verify_integrity:
-            result._verify_integrity()
+            new_codes = result._verify_integrity()
+            result._codes = new_codes
+
         if _set_identity:
             result._reset_identity()
+
         return result
 
+    def _validate_codes(self, level: list, code: list):
+        """
+        Reassign code values as -1 if their corresponding levels are NaN.
+
+        Parameters
+        ----------
+        level : list
+            Level to check for missing values (NaN, NaT, None).
+        code : list
+            Code to reassign.
+
+        Returns
+        -------
+        code : new code where the code value is -1 if it corresponds
+        to a level with missing values (NaN, NaT, None).
+        """
+        null_mask = isna(level)
+        if np.any(null_mask):
+            code = np.where(null_mask[code], -1, code)
+        return code
+
     def _verify_integrity(self, codes=None, levels=None):
         """
 
@@ -263,6 +287,11 @@ def _verify_integrity(self, codes=None, levels=None):
         ValueError
             If length of levels and codes don't match, if the codes for any
             level would exceed level bounds, or there are any duplicate levels.
+
+        Returns
+        -------
+        codes : new codes where the code value is -1 if it corresponds to a
+        NaN level.
         """
         # NOTE: Currently does not check, among other things, that cached
         # nlevels matches nor that sortorder matches actually sortorder.
@@ -272,22 +301,33 @@ def _verify_integrity(self, codes=None, levels=None):
         if len(levels) != len(codes):
             raise ValueError("Length of levels and codes must match. NOTE:"
                              " this index is in an inconsistent state.")
-        codes_length = len(self.codes[0])
+        codes_length = len(codes[0])
         for i, (level, level_codes) in enumerate(zip(levels, codes)):
             if len(level_codes) != codes_length:
                 raise ValueError("Unequal code lengths: %s" %
                                  ([len(code_) for code_ in codes]))
             if len(level_codes) and level_codes.max() >= len(level):
-                raise ValueError("On level %d, code max (%d) >= length of"
-                                 " level (%d). NOTE: this index is in an"
-                                 " inconsistent state" % (i, level_codes.max(),
-                                                          len(level)))
+                msg = ("On level {level}, code max ({max_code}) >= length of "
+                       "level ({level_len}). 
NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) + raise ValueError(msg) + if len(level_codes) and level_codes.min() < -1: + raise ValueError("On level {level}, code value ({code})" + " < -1".format( + level=i, code=level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( values=[value for value in level], level=i)) + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + new_codes = FrozenList(codes) + return new_codes + @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ @@ -586,7 +626,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, new_levels = FrozenList(new_levels) if verify_integrity: - self._verify_integrity(levels=new_levels) + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes names = self.names self._levels = new_levels @@ -676,7 +717,6 @@ def labels(self): def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): - if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): @@ -696,9 +736,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(codes=new_codes) + new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes + self._tuples = None self._reset_cache() @@ -1763,9 +1804,10 @@ def __setstate__(self, state): self._set_levels([Index(x) for x in levels], validate=False) self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) self._set_names(names) self.sortorder = sortorder - self._verify_integrity() self._reset_identity() def __getitem__(self, key): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 37290bc6eb1c0..7cab05660ac49 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -63,9 +63,10 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." + length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. 
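    # (illustration, mirroring the whatsnew entry above: under the new
    #  checks, pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) fails with
    #  the code_value_error message, while codes that point at NaN levels
    #  are remapped to -1 rather than raising)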
with pytest.raises(ValueError, match=length_error):
 
@@ -82,6 +83,44 @@ def test_constructor_mismatched_codes_levels(idx):
     with pytest.raises(ValueError, match=label_error):
         idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])
 
+    # test set_codes with verify_integrity=False
+    # the setting should not raise any value error
+    idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]],
+                         verify_integrity=False)
+
+    # code value smaller than -1
+    with pytest.raises(ValueError, match=code_value_error):
+        MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]])
+
+
+def test_na_levels():
+    # GH26408
+    # test that codes are reassigned to -1 for levels
+    # with missing values (NaN, NaT, None)
+    result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
+                        codes=[[0, -1, 1, 2, 3, 4]])
+    expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
+                          codes=[[-1, -1, -1, -1, 3, 4]])
+    tm.assert_index_equal(result, expected)
+
+    result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
+                        codes=[[0, -1, 1, 2, 3, 4]])
+    expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
+                          codes=[[-1, -1, 1, -1, 3, -1]])
+    tm.assert_index_equal(result, expected)
+
+    # verify set_levels and set_codes
+    result = MultiIndex(
+        levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels(
+        [[np.nan, 's', pd.NaT, 128, None]])
+    tm.assert_index_equal(result, expected)
+
+    result = MultiIndex(
+        levels=[[np.nan, 's', pd.NaT, 128, None]],
+        codes=[[1, 2, 2, 2, 2, 2]]).set_codes(
+        [[0, -1, 1, 2, 3, 4]])
+    tm.assert_index_equal(result, expected)
+
 
 def test_labels_deprecated(idx):
     # GH23752

diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py
index ed90f74d80989..518c12bb20e13 100644
--- a/pandas/tests/indexes/multi/test_missing.py
+++ b/pandas/tests/indexes/multi/test_missing.py
@@ -73,6 +73,21 @@ def test_dropna():
     with pytest.raises(ValueError, match=msg):
         idx.dropna(how='xxx')
 
+    # GH26408
+    # test that missing values are dropped for a MultiIndex constructed
+    # from codes and values
+    idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2],
+                             [np.nan, None, pd.NaT, "128", 2]],
+                     codes=[[0, -1, 1, 2, 3, 4],
+                            [0, -1, 3, 3, 3, 4]])
+    expected = MultiIndex.from_arrays([["128", 2], ["128", 2]])
+    tm.assert_index_equal(idx.dropna(), expected)
+    tm.assert_index_equal(idx.dropna(how='any'), expected)
+
+    expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2],
+                                       ["128", "128", "128", 2]])
+    tm.assert_index_equal(idx.dropna(how='all'), expected)
+
 
 def test_nulls(idx):
     # this is really a smoke test for the methods

From 9a42cbe85461c28417a5130bc80b035044c5575a Mon Sep 17 00:00:00 2001
From: h-vetinari <33685575+h-vetinari@users.noreply.github.com>
Date: Sat, 1 Jun 2019 17:03:06 +0200
Subject: [PATCH 24/51] API: Series.str-accessor infers dtype (and Index.str
 does not raise on all-NA) (#23167)

---
 doc/source/user_guide/text.rst  |  10 ++
 doc/source/whatsnew/v0.25.0.rst |  40 +++++-
 pandas/core/strings.py          | 214 +++++++++++++++++++++++++-------
 pandas/tests/test_strings.py    |  48 +++----
 4 files changed, 233 insertions(+), 79 deletions(-)

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
index f7fdfcf8bf882..87c75e8bcd91f 100644
--- a/doc/source/user_guide/text.rst
+++ b/doc/source/user_guide/text.rst
@@ -70,6 +70,16 @@ and replacing any remaining whitespaces with underscores:
 
    ``.str`` methods which operate on elements of type ``list`` are not available on such a
    ``Series``.
 
+.. _text.warn_types:
+
+.. 
warning::
+
+    Before v0.25.0, the ``.str`` accessor performed only the most rudimentary type checks. Starting with
+    v0.25.0, the type of the Series is inferred and the allowed types (i.e. strings) are enforced more rigorously.
+
+    Generally speaking, the ``.str`` accessor is intended to work only on strings. With very few
+    exceptions, other uses are not supported, and may be disabled at a later point.
+
 
 Splitting and Replacing Strings
 -------------------------------

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 3275223b159f8..87a8010998bd0 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -231,6 +231,43 @@ returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwi
 Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will
 cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before.
 
+The ``.str``-accessor performs stricter type checks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Due to the lack of more fine-grained dtypes, :attr:`Series.str` previously only checked whether the data was
+of ``object`` dtype. :attr:`Series.str` will now infer the dtype of the data *within* the Series; in particular,
+``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`,
+:meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`.
+
+*Previous Behaviour*:
+
+.. code-block:: python
+
+    In [1]: s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object)
+
+    In [2]: s
+    Out[2]:
+    0      b'a'
+    1     b'ba'
+    2    b'cba'
+    dtype: object
+
+    In [3]: s.str.startswith(b'a')
+    Out[3]:
+    0     True
+    1    False
+    2    False
+    dtype: bool
+
+*New Behaviour*:
+
+.. ipython:: python
+    :okexcept:
+
+    s = pd.Series(np.array(['a', 'ba', 'cba'], 'S'), dtype=object)
+    s
+    s.str.startswith(b'a')
+
 .. _whatsnew_0250.api_breaking.incompatible_index_unions:
 
 Incompatible Index Type Unions
@@ -331,7 +368,6 @@ This change is backward compatible for direct usage of Pandas, but if you subcla
 Pandas objects *and* give your subclasses specific ``__str__``/``__repr__``
 methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`).
 
-
 .. _whatsnew_0250.api_breaking.deps:
 
 Increased minimum versions for dependencies
@@ -537,7 +573,7 @@ Conversion
 Strings
 ^^^^^^^
 
--
+- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
 -
 -
 

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index ee3796241690d..bd756491abd2f 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1,4 +1,5 @@
 import codecs
+from functools import wraps
 import re
 import textwrap
 from typing import Dict
@@ -12,8 +13,8 @@
 from pandas.core.dtypes.common import (
     ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
-    is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
-from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+    is_list_like, is_re, is_scalar, is_string_like)
+from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries
 from pandas.core.dtypes.missing import isna
 
 from pandas.core.algorithms import take_1d
@@ -1720,12 +1721,78 @@ def str_encode(arr, encoding, errors="strict"):
     return _na_map(f, arr)
 
 
-def _noarg_wrapper(f, docstring=None, **kargs):
+def forbid_nonstring_types(forbidden, name=None):
+    """
+    Decorator to forbid specific types for a method of StringMethods. 
+
+    For calling `.str.{method}` on a Series or Index, it is necessary to first
+    initialize the :class:`StringMethods` object, and then call the method.
+    However, different methods allow different input types, and so this cannot
+    be checked during :meth:`StringMethods.__init__`, but must be done on a
+    per-method basis. This decorator exists to facilitate this process, and
+    make it explicit which (inferred) types are disallowed by the method.
+
+    :meth:`StringMethods.__init__` allows the *union* of types its different
+    methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
+    namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
+
+    The default string types ['string', 'empty'] are allowed for all methods.
+    For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
+    then needs to forbid the types it is not intended for.
+
+    Parameters
+    ----------
+    forbidden : list-of-str or None
+        List of forbidden non-string types, may be one or more of
+        `['bytes', 'mixed', 'mixed-integer']`.
+    name : str, default None
+        Name of the method to use in the error message. By default, this is
+        None, in which case the name from the method being wrapped will be
+        copied. However, for working with further wrappers (like _pat_wrapper
+        and _noarg_wrapper), it is necessary to specify the name.
+
+    Returns
+    -------
+    func : wrapper
+        The method to which the decorator is applied, with an added check that
+        enforces that the inferred type is not in the list of forbidden types.
+
+    Raises
+    ------
+    TypeError
+        If the inferred type of the underlying data is in `forbidden`.
+    """
+
+    # deal with None
+    forbidden = [] if forbidden is None else forbidden
+
+    allowed_types = {'string', 'empty', 'bytes',
+                     'mixed', 'mixed-integer'} - set(forbidden)
+
+    def _forbid_nonstring_types(func):
+        func_name = func.__name__ if name is None else name
+
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if self._inferred_dtype not in allowed_types:
+                msg = ('Cannot use .str.{name} with values of inferred dtype '
+                       '{inf_type!r}.'.format(name=func_name,
+                                              inf_type=self._inferred_dtype))
+                raise TypeError(msg)
+            return func(self, *args, **kwargs)
+        wrapper.__name__ = func_name
+        return wrapper
+    return _forbid_nonstring_types
+
+
+def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'],
+                   **kargs):
+    @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper(self):
         result = _na_map(f, self._parent, **kargs)
         return self._wrap_result(result)
 
-    wrapper.__name__ = f.__name__
+    wrapper.__name__ = f.__name__ if name is None else name
     if docstring is not None:
         wrapper.__doc__ = docstring
     else:
@@ -1734,22 +1801,26 @@ def wrapper(self):
     return wrapper
 
 
-def _pat_wrapper(f, flags=False, na=False, **kwargs):
+def _pat_wrapper(f, flags=False, na=False, name=None,
+                 forbidden_types=['bytes'], **kwargs):
+    @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper1(self, pat):
         result = f(self._parent, pat)
         return self._wrap_result(result)
 
+    @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper2(self, pat, flags=0, **kwargs):
         result = f(self._parent, pat, flags=flags, **kwargs)
         return self._wrap_result(result)
 
+    @forbid_nonstring_types(forbidden_types, name=name)
     def wrapper3(self, pat, na=np.nan):
         result = f(self._parent, pat, na=na)
         return self._wrap_result(result)
 
     wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
 
-    wrapper.__name__ = f.__name__
+    wrapper.__name__ = f.__name__ if name is None else name
     if f.__doc__:
         wrapper.__doc__ = f.__doc__ 
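Taken together, the decorator and these two wrappers mean that a guarded
method fails fast with a uniform message, while a method wrapped with
``forbidden_types=None`` keeps accepting bytes. A minimal sketch of the
user-visible effect (an illustrative session; the Series is an ad-hoc
example, not taken from this diff):

.. code-block:: python

   s = pd.Series([b'a', b'ba', b'cba'])  # inferred dtype: 'bytes'

   s.str.len()    # still allowed: len() is wrapped with forbidden_types=None
   s.str.upper()  # TypeError: Cannot use .str.upper with values of
                  # inferred dtype 'bytes'.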
@@ -1780,7 +1851,7 @@ class StringMethods(NoNewAttributesMixin): """ def __init__(self, data): - self._validate(data) + self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data) # .values.categories works for both Series/Index @@ -1791,38 +1862,44 @@ def __init__(self, data): @staticmethod def _validate(data): - from pandas.core.index import Index - - if (isinstance(data, ABCSeries) and - not ((is_categorical_dtype(data.dtype) and - is_object_dtype(data.values.categories)) or - (is_object_dtype(data.dtype)))): - # it's neither a string series not a categorical series with - # strings inside the categories. - # this really should exclude all series with any non-string values - # (instead of test for object dtype), but that isn't practical for - # performance reasons until we have a str dtype (GH 9343) + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object (see _make_accessor), and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). + + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + if isinstance(data, ABCMultiIndex): + raise AttributeError('Can only use .str accessor with Index, ' + 'not MultiIndex') + + # see _libs/lib.pyx for list of inferred types + allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] + + values = getattr(data, 'values', data) # Series / Index + values = getattr(values, 'categories', values) # categorical / normal + + # missing values obfuscate type inference -> skip + inferred_dtype = lib.infer_dtype(values, skipna=True) + + if inferred_dtype not in allowed_types: raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - elif isinstance(data, Index): - # can't use ABCIndex to exclude non-str - - # see src/inference.pyx which can contain string values - allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') - if is_categorical_dtype(data.dtype): - inf_type = data.categories.inferred_type - else: - inf_type = data.inferred_type - if inf_type not in allowed_types: - message = ("Can only use .str accessor with string values " - "(i.e. 
inferred_type is 'string', 'unicode' or " - "'mixed')") - raise AttributeError(message) - if data.nlevels > 1: - message = ("Can only use .str accessor with Index, not " - "MultiIndex") - raise AttributeError(message) + "values!") + return inferred_dtype def __getitem__(self, key): if isinstance(key, slice): @@ -2025,12 +2102,13 @@ def _get_series_list(self, others, ignore_index=False): warnings.warn('list-likes other than Series, Index, or ' 'np.ndarray WITHIN another list-like are ' 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=3) + 'version.', FutureWarning, stacklevel=4) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) + @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2211,7 +2289,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "Index/DataFrame in `others`. To enable alignment " "and silence this warning, pass `join='left'|" "'outer'|'inner'|'right'`. The future default will " - "be `join='left'`.", FutureWarning, stacklevel=2) + "be `join='left'`.", FutureWarning, stacklevel=3) # if join is None, _get_series_list already force-aligned indexes join = 'left' if join is None else join @@ -2384,6 +2462,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): @Appender(_shared_docs['str_split'] % { 'side': 'beginning', 'method': 'split'}) + @forbid_nonstring_types(['bytes']) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2391,6 +2470,7 @@ def split(self, pat=None, n=-1, expand=False): @Appender(_shared_docs['str_split'] % { 'side': 'end', 'method': 'rsplit'}) + @forbid_nonstring_types(['bytes']) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) @@ -2485,6 +2565,7 @@ def rsplit(self, pat=None, n=-1, expand=False): '`sep`.' }) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def partition(self, sep=' ', expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) @@ -2498,6 +2579,7 @@ def partition(self, sep=' ', expand=True): '`sep`.' 
}) @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') + @forbid_nonstring_types(['bytes']) def rpartition(self, sep=' ', expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) @@ -2509,33 +2591,39 @@ def get(self, i): return self._wrap_result(result) @copy(str_join) + @forbid_nonstring_types(['bytes']) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) + @forbid_nonstring_types(['bytes']) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) return self._wrap_result(result, fill_value=na) @copy(str_match) + @forbid_nonstring_types(['bytes']) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) + @forbid_nonstring_types(['bytes']) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): result = str_replace(self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex) return self._wrap_result(result) @copy(str_repeat) + @forbid_nonstring_types(['bytes']) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) + @forbid_nonstring_types(['bytes']) def pad(self, width, side='left', fillchar=' '): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) @@ -2559,17 +2647,21 @@ def pad(self, width, side='left', fillchar=' '): @Appender(_shared_docs['str_pad'] % dict(side='left and right', method='center')) + @forbid_nonstring_types(['bytes']) def center(self, width, fillchar=' '): return self.pad(width, side='both', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) + @forbid_nonstring_types(['bytes']) def ljust(self, width, fillchar=' '): return self.pad(width, side='right', fillchar=fillchar) @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust')) + @forbid_nonstring_types(['bytes']) def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) + @forbid_nonstring_types(['bytes']) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. 
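Note that ``center``, ``ljust`` and ``rjust`` above simply delegate to
``pad`` with a fixed ``side``, yet each carries its own
``@forbid_nonstring_types`` decorator: the check runs before the delegation,
so the error names the method the user actually called. For example
(hypothetical session):

.. code-block:: python

   pd.Series([b'a']).str.center(3)
   # TypeError: Cannot use .str.center with values of inferred dtype 'bytes'.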
@@ -2639,16 +2731,19 @@ def slice(self, start=None, stop=None, step=None): return self._wrap_result(result) @copy(str_slice_replace) + @forbid_nonstring_types(['bytes']) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @copy(str_decode) def decode(self, encoding, errors="strict"): + # need to allow bytes here result = str_decode(self._parent, encoding, errors) return self._wrap_result(result) @copy(str_encode) + @forbid_nonstring_types(['bytes']) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) @@ -2718,28 +2813,33 @@ def encode(self, encoding, errors="strict"): @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', method='strip')) + @forbid_nonstring_types(['bytes']) def strip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='both') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='left side', method='lstrip')) + @forbid_nonstring_types(['bytes']) def lstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='left') return self._wrap_result(result) @Appender(_shared_docs['str_strip'] % dict(side='right side', method='rstrip')) + @forbid_nonstring_types(['bytes']) def rstrip(self, to_strip=None): result = str_strip(self._parent, to_strip, side='right') return self._wrap_result(result) @copy(str_wrap) + @forbid_nonstring_types(['bytes']) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) @copy(str_get_dummies) + @forbid_nonstring_types(['bytes']) def get_dummies(self, sep='|'): # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
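The asymmetry between ``decode`` and ``encode`` above is deliberate: decoding
is precisely the operation that turns bytes into strings, so ``decode`` is
left undecorated while ``encode`` forbids bytes input. A short illustrative
round trip (example values, not from the diff):

.. code-block:: python

   s = pd.Series([b'a', b'b'])
   s.str.decode('ascii')   # ok: bytes are exactly what decode expects
   s.str.encode('ascii')   # TypeError: Cannot use .str.encode with values
                           # of inferred dtype 'bytes'.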
@@ -2749,20 +2849,23 @@ def get_dummies(self, sep='|'): name=name, expand=True) @copy(str_translate) + @forbid_nonstring_types(['bytes']) def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True) - startswith = _pat_wrapper(str_startswith, na=True) - endswith = _pat_wrapper(str_endswith, na=True) - findall = _pat_wrapper(str_findall, flags=True) + count = _pat_wrapper(str_count, flags=True, name='count') + startswith = _pat_wrapper(str_startswith, na=True, name='startswith') + endswith = _pat_wrapper(str_endswith, na=True, name='endswith') + findall = _pat_wrapper(str_findall, flags=True, name='findall') @copy(str_extract) + @forbid_nonstring_types(['bytes']) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) + @forbid_nonstring_types(['bytes']) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) @@ -2792,6 +2895,7 @@ def extractall(self, pat, flags=0): @Appender(_shared_docs['find'] % dict(side='lowest', method='find', also='rfind : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def find(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='left') return self._wrap_result(result) @@ -2799,11 +2903,13 @@ def find(self, sub, start=0, end=None): @Appender(_shared_docs['find'] % dict(side='highest', method='rfind', also='find : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rfind(self, sub, start=0, end=None): result = str_find(self._parent, sub, start=start, end=end, side='right') return self._wrap_result(result) + @forbid_nonstring_types(['bytes']) def normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. @@ -2851,6 +2957,7 @@ def normalize(self, form): @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index', also='rindex : Return highest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def index(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='left') @@ -2859,6 +2966,7 @@ def index(self, sub, start=0, end=None): @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex', also='index : Return lowest indexes in each strings.')) + @forbid_nonstring_types(['bytes']) def rindex(self, sub, start=0, end=None): result = str_index(self._parent, sub, start=start, end=end, side='right') @@ -2908,7 +3016,8 @@ def rindex(self, sub, start=0, end=None): 5 3.0 dtype: float64 """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int) + len = _noarg_wrapper(len, docstring=_shared_docs['len'], + forbidden_types=None, dtype=int) _shared_docs['casemethods'] = (""" Convert strings in the Series/Index to %(type)s. @@ -2989,21 +3098,27 @@ def rindex(self, sub, start=0, end=None): _doc_args['casefold'] = dict(type='be casefolded', method='casefold', version='\n .. 
versionadded:: 0.25.0\n') lower = _noarg_wrapper(lambda x: x.lower(), + name='lower', docstring=_shared_docs['casemethods'] % _doc_args['lower']) upper = _noarg_wrapper(lambda x: x.upper(), + name='upper', docstring=_shared_docs['casemethods'] % _doc_args['upper']) title = _noarg_wrapper(lambda x: x.title(), + name='title', docstring=_shared_docs['casemethods'] % _doc_args['title']) capitalize = _noarg_wrapper(lambda x: x.capitalize(), + name='capitalize', docstring=_shared_docs['casemethods'] % _doc_args['capitalize']) swapcase = _noarg_wrapper(lambda x: x.swapcase(), + name='swapcase', docstring=_shared_docs['casemethods'] % _doc_args['swapcase']) casefold = _noarg_wrapper(lambda x: x.casefold(), + name='casefold', docstring=_shared_docs['casemethods'] % _doc_args['casefold']) @@ -3157,30 +3272,39 @@ def rindex(self, sub, start=0, end=None): _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric') _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal') isalnum = _noarg_wrapper(lambda x: x.isalnum(), + name='isalnum', docstring=_shared_docs['ismethods'] % _doc_args['isalnum']) isalpha = _noarg_wrapper(lambda x: x.isalpha(), + name='isalpha', docstring=_shared_docs['ismethods'] % _doc_args['isalpha']) isdigit = _noarg_wrapper(lambda x: x.isdigit(), + name='isdigit', docstring=_shared_docs['ismethods'] % _doc_args['isdigit']) isspace = _noarg_wrapper(lambda x: x.isspace(), + name='isspace', docstring=_shared_docs['ismethods'] % _doc_args['isspace']) islower = _noarg_wrapper(lambda x: x.islower(), + name='islower', docstring=_shared_docs['ismethods'] % _doc_args['islower']) isupper = _noarg_wrapper(lambda x: x.isupper(), + name='isupper', docstring=_shared_docs['ismethods'] % _doc_args['isupper']) istitle = _noarg_wrapper(lambda x: x.istitle(), + name='istitle', docstring=_shared_docs['ismethods'] % _doc_args['istitle']) isnumeric = _noarg_wrapper(lambda x: x.isnumeric(), + name='isnumeric', docstring=_shared_docs['ismethods'] % _doc_args['isnumeric']) isdecimal = _noarg_wrapper(lambda x: x.isdecimal(), + name='isdecimal', docstring=_shared_docs['ismethods'] % _doc_args['isdecimal']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2951ca24fa7ff..1ba0ef3918fb7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -150,6 +150,9 @@ def any_allowed_skipna_inferred_dtype(request): ... inferred_dtype, values = any_allowed_skipna_inferred_dtype ... # will pass ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
pd.Series(values).str """ inferred_dtype, values = request.param values = np.array(values, dtype=object) # object dtype to avoid casting @@ -179,20 +182,6 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): pytest.xfail(reason='Conversion to numpy array fails because ' 'the ._values-attribute is not a numpy array for ' 'PeriodArray/IntervalArray; see GH 23553') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') - if (box == Series - and (dtype == object and inferred_dtype not in [ - 'string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer']) - or (dtype == 'category' - and inferred_dtype in ['decimal', 'boolean', 'time'])): - pytest.xfail(reason='Not raising correctly; solved by GH 23167') types_passing_constructor = ['string', 'unicode', 'empty', 'bytes', 'mixed', 'mixed-integer'] @@ -220,27 +209,21 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name not in ['encode', 'decode', 'len'] - and inferred_dtype == 'bytes'): - pytest.xfail(reason='Not raising for "bytes", see GH 23011;' - 'Also: malformed method names, see GH 23551; ' - 'solved by GH 23167') - if (method_name == 'cat' - and inferred_dtype in ['mixed', 'mixed-integer']): - pytest.xfail(reason='Bad error message; should raise better; ' - 'solved by GH 23167') - if box == Index and inferred_dtype in ['empty', 'bytes']: - pytest.xfail(reason='Raising too restrictively; ' - 'solved by GH 23167') - if (box == Index and dtype == object - and inferred_dtype in ['boolean', 'date', 'time']): - pytest.xfail(reason='Inferring incorrectly because of NaNs; ' - 'solved by GH 23167') + if (method_name in ['partition', 'rpartition'] and box == Index + and inferred_dtype == 'empty'): + pytest.xfail(reason='Method cannot deal with empty Index') + if (method_name == 'split' and box == Index and values.size == 0 + and kwargs.get('expand', None) is not None): + pytest.xfail(reason='Split fails on empty Series when expand=True') + if (method_name == 'get_dummies' and box == Index + and inferred_dtype == 'empty' and (dtype == object + or values.size == 0)): + pytest.xfail(reason='Need to fortify get_dummies corner cases') t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['encode', 'decode', 'len'] + bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. 
# This could be changed with an 'errors'-kwarg to the `str`-accessor, @@ -3167,7 +3150,8 @@ def test_str_accessor_no_new_attributes(self): def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) - with pytest.raises(TypeError, match="can't concat str to bytes"): + with pytest.raises(TypeError, + match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self): From 0dbb99efc259c5182ac88f116ebb76ae6e2db6ee Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 1 Jun 2019 16:34:57 +0100 Subject: [PATCH 25/51] Changing dev docs ssh key (#26604) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9f83917024049..0064d0a932960 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -161,7 +161,7 @@ jobs: - task: InstallSSHKey@0 inputs: hostName: 'github.com' - sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDHmz3l/EdqrgNxEUKkwDUuUcLv91unig03pYFGO/DMIgCmPdMG96zAgfnESd837Rm0wSSqylwSzkRJt5MV/TpFlcVifDLDQmUhqCeO8Z6dLl/oe35UKmyYICVwcvQTAaHNnYRpKC5IUlTh0JEtw9fGlnp1Ta7U1ENBLbKdpywczElhZu+hOQ892zqOj3CwA+U2329/d6cd7YnqIKoFN9DWT3kS5K6JE4IoBfQEVekIOs23bKjNLvPoOmi6CroAhu/K8j+NCWQjge5eJf2x/yTnIIP1PlEcXoHIr8io517posIx3TBup+CN8bNS1PpDW3jyD3ttl1uoBudjOQrobNnJeR6Rn67DRkG6IhSwr3BWj8alwUG5mTdZzwV5Pa9KZFdIiqX7NoDGg+itsR39QCn0thK8lGRNSR8KrWC1PSjecwelKBO7uQ7rnk/rkrZdBWR4oEA8YgNH8tirUw5WfOr5a0AIaJicKxGKNdMxZt+zmC+bS7F4YCOGIm9KHa43RrKhoGRhRf9fHHHKUPwFGqtWG4ykcUgoamDOURJyepesBAO3FiRE9rLU6ILbB3yEqqoekborHmAJD5vf7PWItW3Q/YQKuk3kkqRcKnexPyzyyq5lUgTi8CxxZdaASIOu294wjBhhdyHlXEkVTNJ9JKkj/obF+XiIIp0cBDsOXY9hDQ== pandas-dev@python.org' + sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDfF0BSddjvZx/z4/2TXsy+RxjwBpgdHkmjtL9WfRHxEw1TchBuEj5vWWcxBNTK+9oVzD/Lca89HAXXrklsfkdAK3LvLfGCxTGpP8t/3CxxFdnSg3EN+4cDGKuDlbeTyzdASdPBOq0GTZjUFekl9ZfFrFJ9SoPpqZ4mmPRPapPrkwTs4xIrBly0eWcISFYgZcG58m65+XQpyyBMbpsO5ZHBBxE8kkWN0yY+gKt5PeeIO82xE+7F+3Qhlc67fKfB4FEitQ5SKrbKyGNNdFtEGcC6CEtD0B0vJxssltQEl5dDWPJP6tH4cIm/J6m28mpSYc5fEBhr75jE4Ybw6NtGgBZEdtFRFlnb91mSiVSjM/HEkV7/xYai+H1Gk+I/8tcl8cf3JCiJSP2glz8bp52+i5it29FUL8ITxdJSo0duUkVm3nZ8cDI6zag+nSSmzdZ1I9Fw7M7RRPHM2zd5+6RskeqamR5lY3Iv+t8Yo8cRX10IiHNF89b+3vI5ZkIKqytrPfrY45jGVMXA6x/whMh94Ac94qm+Do7P3eT/66a1lX0r+UfV6UnfwHE6cZ1ZFX2AzlmSiYMKmTD3hn1GNyHHuvk3Mneanbk4+x+8SjAXIK354zJ8c1Qgk1iEicDvna2IBd94R4tBWjYZ8xH7avmPlhs0HwbjiNOFDc45UXvwIl+D7w== pandas-dev@python.org' sshKeySecureFile: 'pandas_docs_key' displayName: 'Install GitHub ssh deployment key' condition : | From 3db9dc308bad04f180950630f5966cbee27916a7 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 1 Jun 2019 17:46:56 +0100 Subject: [PATCH 26/51] CI: Removing doc build in azure (#26609) --- azure-pipelines.yml | 60 --------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0064d0a932960..85325c52e7e6d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -116,63 +116,3 @@ jobs: fi displayName: 'Running benchmarks' condition: true - -- job: 'Docs' - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 90 - steps: - - script: | - echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev' - echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' - displayName: 'Setting environment variables' - - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - sudo apt-get install -y libc6-dev-i386 - ci/setup_env.sh - displayName: 'Setup environment and build 
pandas' - - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - source activate pandas-dev - doc/make.py - displayName: 'Build documentation' - - - script: | - cd doc/build/html - git init - touch .nojekyll - git add --all . - git config user.email "pandas-dev@python.org" - git config user.name "pandas-docs-bot" - git commit -m "pandas documentation in master" - displayName: 'Create git repo for docs build' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - # This task to work requires next steps: - # 1. Got to "Library > Secure files" in the azure-pipelines dashboard: https://dev.azure.com/pandas-dev/pandas/_library?itemType=SecureFiles - # 2. Click on "+ Secure file" - # 3. Upload the private key (the name of the file must match with the specified in "sshKeySecureFile" input below, "pandas_docs_key") - # 4. Click on file name after it is created, tick the box "Authorize for use in all pipelines" and save - # 5. The public key specified in "sshPublicKey" is the pair of the uploaded private key, and needs to be specified as a deploy key of the repo where the docs will be pushed: https://github.com/pandas-dev/pandas-dev.github.io/settings/keys - - task: InstallSSHKey@0 - inputs: - hostName: 'github.com' - sshPublicKey: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDfF0BSddjvZx/z4/2TXsy+RxjwBpgdHkmjtL9WfRHxEw1TchBuEj5vWWcxBNTK+9oVzD/Lca89HAXXrklsfkdAK3LvLfGCxTGpP8t/3CxxFdnSg3EN+4cDGKuDlbeTyzdASdPBOq0GTZjUFekl9ZfFrFJ9SoPpqZ4mmPRPapPrkwTs4xIrBly0eWcISFYgZcG58m65+XQpyyBMbpsO5ZHBBxE8kkWN0yY+gKt5PeeIO82xE+7F+3Qhlc67fKfB4FEitQ5SKrbKyGNNdFtEGcC6CEtD0B0vJxssltQEl5dDWPJP6tH4cIm/J6m28mpSYc5fEBhr75jE4Ybw6NtGgBZEdtFRFlnb91mSiVSjM/HEkV7/xYai+H1Gk+I/8tcl8cf3JCiJSP2glz8bp52+i5it29FUL8ITxdJSo0duUkVm3nZ8cDI6zag+nSSmzdZ1I9Fw7M7RRPHM2zd5+6RskeqamR5lY3Iv+t8Yo8cRX10IiHNF89b+3vI5ZkIKqytrPfrY45jGVMXA6x/whMh94Ac94qm+Do7P3eT/66a1lX0r+UfV6UnfwHE6cZ1ZFX2AzlmSiYMKmTD3hn1GNyHHuvk3Mneanbk4+x+8SjAXIK354zJ8c1Qgk1iEicDvna2IBd94R4tBWjYZ8xH7avmPlhs0HwbjiNOFDc45UXvwIl+D7w== pandas-dev@python.org' - sshKeySecureFile: 'pandas_docs_key' - displayName: 'Install GitHub ssh deployment key' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) - - - script: | - cd doc/build/html - git remote add origin git@github.com:pandas-dev/pandas-dev.github.io.git - git push origin master -f - displayName: 'Publish docs to GitHub pages' - condition : | - and(not(eq(variables['Build.Reason'], 'PullRequest')), - eq(variables['Build.SourceBranch'], 'refs/heads/master')) From 437efa6e974e506c7cc5f142d5186bf6a7f5ce13 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 1 Jun 2019 17:03:58 +0000 Subject: [PATCH 27/51] PERF: don't call RangeIndex._data unnecessarily (#26565) --- asv_bench/benchmarks/index_object.py | 6 +++++ doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/range.py | 32 +++++++++++++++++++++++-- pandas/tests/indexes/test_range.py | 36 ++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 896a20bae2069..78fe2ae966896 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -94,6 +94,12 @@ def time_min(self): def time_min_trivial(self): self.idx_inc.min() + def time_get_loc_inc(self): + self.idx_inc.get_loc(900000) + + def time_get_loc_dec(self): + self.idx_dec.get_loc(100000) + class IndexAppend: diff --git 
a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 87a8010998bd0..1619ba1a45739 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -493,6 +493,7 @@ Performance Improvements
 - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
   int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
 - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
+- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
 - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
 - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
 - Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)

diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index ea14a4c789cd3..9401de3346ccd 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -22,6 +22,8 @@
 from pandas.core.indexes.base import Index, _index_shared_docs
 from pandas.core.indexes.numeric import Int64Index
 
+from pandas.io.formats.printing import pprint_thing
+
 
 class RangeIndex(Int64Index):
     """
@@ -64,6 +66,8 @@ class RangeIndex(Int64Index):
 
     _typ = 'rangeindex'
     _engine_type = libindex.Int64Engine
+    # check whether self._data has been called
+    _cached_data = None  # type: np.ndarray
 
     # --------------------------------------------------------------------
     # Constructors
@@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None,
         for k, v in kwargs.items():
             setattr(result, k, v)
 
+        result._range = range(result._start, result._stop, result._step)
+
         result._reset_identity()
         return result
 
@@ -180,9 +186,19 @@ def _constructor(self):
         """ return the class to use for construction """
         return Int64Index
 
-    @cache_readonly
+    @property
     def _data(self):
-        return np.arange(self._start, self._stop, self._step, dtype=np.int64)
+        """
+        An int array that for performance reasons is created only when needed.
+
+        The constructed array is saved in ``_cached_data``. This allows us to
+        check if the array has been created without accessing ``_data`` and
+        triggering the construction. 
+ """ + if self._cached_data is None: + self._cached_data = np.arange(self._start, self._stop, self._step, + dtype=np.int64) + return self._cached_data @cache_readonly def _int64index(self): @@ -215,6 +231,9 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header, na_rep='NaN', **kwargs): + return header + list(map(pprint_thing, self._range)) + # -------------------------------------------------------------------- @property def start(self): @@ -296,6 +315,15 @@ def is_monotonic_decreasing(self): def has_duplicates(self): return False + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if is_integer(key) and method is None and tolerance is None: + try: + return self._range.index(key) + except ValueError: + raise KeyError(key) + return super().get_loc(key, method=method, tolerance=tolerance) + def tolist(self): return list(range(self._start, self._stop, self._step)) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index b2c330015081c..477a4e527f278 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -241,6 +241,42 @@ def test_view(self): def test_dtype(self): assert self.index.dtype == np.int64 + def test_cached_data(self): + # GH 26565 + # Calling RangeIndex._data caches an int64 array of the same length at + # self._cached_data. This tests whether _cached_data has been set. + idx = RangeIndex(0, 100, 10) + + assert idx._cached_data is None + + repr(idx) + assert idx._cached_data is None + + str(idx) + assert idx._cached_data is None + + idx.get_loc(20) + assert idx._cached_data is None + + df = pd.DataFrame({'a': range(10)}, index=idx) + + df.loc[50] + assert idx._cached_data is None + + with pytest.raises(KeyError): + df.loc[51] + assert idx._cached_data is None + + df.loc[10:50] + assert idx._cached_data is None + + df.iloc[5:10] + assert idx._cached_data is None + + # actually calling data._data + assert isinstance(idx._data, np.ndarray) + assert isinstance(idx._cached_data, np.ndarray) + def test_is_monotonic(self): assert self.index.is_monotonic is True assert self.index.is_monotonic_increasing is True From addc5fcd95064b765d4ee4260304d44822fdee3b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 2 Jun 2019 12:47:34 +0100 Subject: [PATCH 28/51] CI: pin pytest version on Python 3.5 (#26619) --- ci/deps/azure-35-compat.yaml | 2 +- ci/deps/azure-macos-35.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-35-compat.yaml index d0a48bd3f8b27..e55a4fbdf3fa9 100644 --- a/ci/deps/azure-35-compat.yaml +++ b/ci/deps/azure-35-compat.yaml @@ -26,5 +26,5 @@ dependencies: - pip - pip: # for python 3.5, pytest>=4.0.2 is not available in conda - - pytest>=4.0.2 + - pytest==4.5.0 - html5lib==1.0b2 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 591266348a5f1..00c2051f29760 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -25,7 +25,7 @@ dependencies: - pip: - python-dateutil==2.5.3 # universal - - pytest>=4.0.2 + - pytest==4.5.0 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 From 33552913376cf23a890d482d8661b3140062496c Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Sun, 2 Jun 2019 17:09:44 -0400 Subject: [PATCH 29/51] remove outdated gtk package from code (#26590) --- doc/source/install.rst | 1 - doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 1 + 
pandas/core/generic.py            |  2 +-
 pandas/io/clipboard/__init__.py   | 21 +++++----------------
 pandas/io/clipboard/clipboards.py | 16 ----------------
 pandas/io/clipboards.py           |  2 +-
 7 files changed, 9 insertions(+), 36 deletions(-)

diff --git a/doc/source/install.rst b/doc/source/install.rst
index b3b5945cc515e..98443ede2e965 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -281,7 +281,6 @@ Optional Dependencies
   `qtpy `__ (requires PyQt or PySide),
   `PyQt5 `__,
   `PyQt4 `__,
-  `pygtk `__,
   `xsel `__, or
   `xclip `__: necessary to use
   :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 88d8ccbbe036e..4aacb6fa1e278 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -3272,7 +3272,7 @@ We can see that we got the same content back, which we had earlier written to th
 
 .. note::
 
-   You may need to install xclip or xsel (with gtk, PyQt5, PyQt4 or qtpy) on Linux to use these methods.
+   You may need to install xclip or xsel (with PyQt5, PyQt4 or qtpy) on Linux to use these methods.
 
 .. _io.pickle:

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1619ba1a45739..f122c73325b7d 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -434,6 +434,7 @@ Other API Changes
 - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`)
 - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`)
 - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`)
+- Removed support for the gtk package for clipboards (:issue:`26563`)
 
 .. _whatsnew_0250.deprecations:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 7ca2c52e18c41..33b0035e74913 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2679,7 +2679,7 @@ def to_clipboard(self, excel=True, sep=None, **kwargs):
         -----
         Requirements for your platform.
 
-          - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules)
+          - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
           - Windows : none
           - OS X : none

diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py
index b76a843e3e7f2..2063978c76c5a 100644
--- a/pandas/io/clipboard/__init__.py
+++ b/pandas/io/clipboard/__init__.py
@@ -18,21 +18,19 @@
 On Linux, install xclip or xsel via package manager. For example, in Debian:
 sudo apt-get install xclip
 
-Otherwise on Linux, you will need the gtk, qtpy or PyQt modules installed.
+Otherwise on Linux, you will need the qtpy or PyQt modules installed.
 qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2
 
-gtk and PyQt4 modules are not available for Python 3,
-and this module does not work with PyGObject yet.
+This module does not work with PyGObject yet. 
""" __version__ = '1.5.27' import platform import os import subprocess -from .clipboards import (init_osx_clipboard, - init_gtk_clipboard, init_qt_clipboard, - init_xclip_clipboard, init_xsel_clipboard, - init_klipper_clipboard, init_no_clipboard) +from .clipboards import ( + init_osx_clipboard, init_qt_clipboard, init_xclip_clipboard, + init_xsel_clipboard, init_klipper_clipboard, init_no_clipboard) from .windows import init_windows_clipboard # `import qtpy` sys.exit()s if DISPLAY is not in the environment. @@ -60,14 +58,6 @@ def determine_clipboard(): return init_osx_clipboard() if HAS_DISPLAY: # Determine which command/module is installed, if any. - try: - # Check if gtk is installed - import gtk # noqa - except ImportError: - pass - else: - return init_gtk_clipboard() - try: # qtpy is a small abstraction layer that lets you write # applications using a single api call to either PyQt or PySide @@ -104,7 +94,6 @@ def set_clipboard(clipboard): global copy, paste clipboard_types = {'osx': init_osx_clipboard, - 'gtk': init_gtk_clipboard, 'qt': init_qt_clipboard, 'xclip': init_xclip_clipboard, 'xsel': init_xsel_clipboard, diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index 66e2e35bf0c59..52abdeafb5ecc 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -22,22 +22,6 @@ def paste_osx(): return copy_osx, paste_osx -def init_gtk_clipboard(): - import gtk - - def copy_gtk(text): - global cb - cb = gtk.Clipboard() - cb.set_text(text) - cb.store() - - def paste_gtk(): - clipboardContents = gtk.Clipboard().wait_for_text() - return clipboardContents - - return copy_gtk, paste_gtk - - def init_qt_clipboard(): # $DISPLAY should exist diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index be1256edf7afe..dc30285895dd5 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -91,7 +91,7 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover Notes ----- Requirements for your platform - - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Linux: xclip, or xsel (with PyQt4 modules) - Windows: - OS X: """ From efc2adaa3553f647737307aec85399b627002c03 Mon Sep 17 00:00:00 2001 From: iamshwin <23633545+iamshwin@users.noreply.github.com> Date: Mon, 3 Jun 2019 00:11:48 +0100 Subject: [PATCH 30/51] Tidy documentation about plotting Series histograms (#26624) --- pandas/plotting/_core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index fed4b0d90983c..3f6a30c4639bc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2477,8 +2477,6 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. 
- bins : integer, default 10 - Number of histogram bins to be used `**kwds` : keywords To be passed to the actual plotting function From 5c6dd43e3e85235f32444df73abb66528336b319 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 3 Jun 2019 00:13:08 +0100 Subject: [PATCH 31/51] TST/CLN: deduplicate fixture from test_to_latex.py (#26603) --- pandas/conftest.py | 31 ++++++++++++++++++++++++ pandas/tests/frame/conftest.py | 29 ---------------------- pandas/tests/io/formats/test_to_latex.py | 27 +++++++++------------ 3 files changed, 42 insertions(+), 45 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 8f71028f51ab4..09fe8e0829fa1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -12,6 +12,8 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import DataFrame +import pandas.util.testing as tm hypothesis.settings.register_profile( "ci", @@ -690,3 +692,32 @@ def tick_classes(request): normalize=st.booleans(), startingMonth=st.integers(min_value=1, max_value=12) )) + + +@pytest.fixture +def float_frame(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']. + + A B C D + P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 + qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 + tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 + wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 + M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 + QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 + r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 + ... ... ... ... ... + IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 + lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 + qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 + yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 + 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 + eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 + xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 + + [30 rows x 4 columns] + """ + return DataFrame(tm.getSeriesData()) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index c451cd58f1497..d8a590bc492a4 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -5,35 +5,6 @@ import pandas.util.testing as tm -@pytest.fixture -def float_frame(): - """ - Fixture for DataFrame of floats with index of unique strings - - Columns are ['A', 'B', 'C', 'D']. - - A B C D - P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 - qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 - tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 - wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 - M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 - QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 - r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 - ... ... ... ... ... 
- IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 - lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 - qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 - yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 - 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 - eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 - xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getSeriesData()) - - @pytest.fixture def float_frame_with_na(): """ diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 5a6511fbd20ee..b9f28ec36d021 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -8,19 +8,14 @@ from pandas.util import testing as tm -@pytest.fixture -def frame(): - return DataFrame(tm.getSeriesData()) - - class TestToLatex: - def test_to_latex_filename(self, frame): + def test_to_latex_filename(self, float_frame): with tm.ensure_clean('test.tex') as path: - frame.to_latex(path) + float_frame.to_latex(path) with open(path, 'r') as f: - assert frame.to_latex() == f.read() + assert float_frame.to_latex() == f.read() # test with utf-8 and encoding option (GH 7061) df = DataFrame([['au\xdfgangen']]) @@ -35,9 +30,9 @@ def test_to_latex_filename(self, frame): with codecs.open(path, 'r', encoding='utf-8') as f: assert df.to_latex() == f.read() - def test_to_latex(self, frame): + def test_to_latex(self, float_frame): # it works! - frame.to_latex() + float_frame.to_latex() df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex() @@ -66,9 +61,9 @@ def test_to_latex(self, frame): assert withoutindex_result == withoutindex_expected - def test_to_latex_format(self, frame): + def test_to_latex_format(self, float_frame): # GH Bug #9402 - frame.to_latex(column_format='ccc') + float_frame.to_latex(column_format='ccc') df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(column_format='ccc') @@ -389,8 +384,8 @@ def test_to_latex_special_escape(self): """ assert escaped_result == escaped_expected - def test_to_latex_longtable(self, frame): - frame.to_latex(longtable=True) + def test_to_latex_longtable(self, float_frame): + float_frame.to_latex(longtable=True) df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(longtable=True) @@ -535,9 +530,9 @@ def test_to_latex_specified_header(self): with pytest.raises(ValueError): df.to_latex(header=['A']) - def test_to_latex_decimal(self, frame): + def test_to_latex_decimal(self, float_frame): # GH 12031 - frame.to_latex() + float_frame.to_latex() df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) withindex_result = df.to_latex(decimal=',') From 4f332f6f4b27111c9ab7ba686b3bc51db2e6f7bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 2 Jun 2019 16:20:15 -0700 Subject: [PATCH 32/51] CLN: Remove convert_objects (#26612) --- doc/source/reference/frame.rst | 1 - doc/source/reference/series.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 48 +--------- pandas/tests/series/test_internals.py | 125 -------------------------- 5 files changed, 2 insertions(+), 174 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index dfa475684c834..b4fb85c028b3e 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -48,7 +48,6 @@ Conversion :toctree: api/ DataFrame.astype - DataFrame.convert_objects DataFrame.infer_objects DataFrame.copy DataFrame.isna diff --git 
a/doc/source/reference/series.rst b/doc/source/reference/series.rst index b406893e3414a..8fccdea979602 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -56,7 +56,6 @@ Conversion Series.astype Series.infer_objects - Series.convert_objects Series.copy Series.bool Series.to_numpy diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f122c73325b7d..1cbec223008c4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -483,6 +483,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``TimeGrouper`` (:issue:`16942`) - Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) - Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) +- Removed the previously deprecated ``convert_objects`` (:issue:`11221`) .. _whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 33b0035e74913..2428bbad7003b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -113,7 +113,7 @@ class NDFrame(PandasObject, SelectionMixin): _internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] _deprecations = frozenset([ - 'as_blocks', 'blocks', 'convert_objects', 'is_copy' + 'as_blocks', 'blocks', 'is_copy' ]) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None @@ -5913,52 +5913,6 @@ def _convert(self, datetime=False, numeric=False, timedelta=False, timedelta=timedelta, coerce=coerce, copy=copy)).__finalize__(self) - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): - """ - Attempt to infer better dtype for object columns. - - .. deprecated:: 0.21.0 - - Parameters - ---------- - convert_dates : boolean, default True - If True, convert to date where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. - convert_numeric : boolean, default False - If True, attempt to coerce to numbers (including strings), with - unconvertible values becoming NaN. - convert_timedeltas : boolean, default True - If True, convert to timedelta where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. - copy : boolean, default True - If True, return a copy even if no copy is necessary (e.g. no - conversion was done). Note: This is meant for internal use, and - should not be confused with inplace. - - Returns - ------- - converted : same as input object - - See Also - -------- - to_datetime : Convert argument to datetime. - to_timedelta : Convert argument to timedelta. - to_numeric : Convert argument to numeric type. - """ - msg = ("convert_objects is deprecated. To re-infer data dtypes for " - "object columns, use {klass}.infer_objects()\nFor all " - "other conversions use the data-type specific converters " - "pd.to_datetime, pd.to_timedelta and pd.to_numeric." - ).format(klass=self.__class__.__name__) - warnings.warn(msg, FutureWarning, stacklevel=2) - - return self._constructor( - self._data.convert(convert_dates=convert_dates, - convert_numeric=convert_numeric, - convert_timedeltas=convert_timedeltas, - copy=copy)).__finalize__(self) - def infer_objects(self): """ Attempt to infer better dtypes for object columns. 
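For code that still calls the removed method, here is a minimal migration sketch following the replacement path named in the old deprecation message (``infer_objects`` plus the data-type specific ``pd.to_*`` converters); the frame and column names below are hypothetical:

    import pandas as pd

    # Object-dtype columns of the kind convert_objects() used to coerce.
    df = pd.DataFrame({'nums': ['1', '2', 'garbled'],
                       'dates': ['20010101', '20010102', 'foo']}, dtype=object)

    # Soft conversion: re-infer better dtypes for object columns.
    df = df.infer_objects()

    # Explicit, type-specific conversion; errors='coerce' turns
    # unconvertible values into NaN / NaT instead of raising.
    df['nums'] = pd.to_numeric(df['nums'], errors='coerce')
    df['dates'] = pd.to_datetime(df['dates'], format='%Y%m%d', errors='coerce')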
diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index f6f4a2db359f7..29846f10dae33 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -12,131 +12,6 @@ class TestSeriesInternals: - def test_convert_objects(self): - - s = Series([1., 2, 3], index=['a', 'b', 'c']) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - # force numeric conversion - r = s.copy().astype('O') - r['a'] = '1' - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - r = s.copy().astype('O') - r['a'] = '1.' - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, s) - - r = s.copy().astype('O') - r['a'] = 'garbled' - expected = s.copy() - expected['a'] = np.nan - with tm.assert_produces_warning(FutureWarning): - result = r.convert_objects(convert_dates=False, - convert_numeric=True) - assert_series_equal(result, expected) - - # GH 4119, not converting a mixed type (e.g.floats and object) - s = Series([1, 'na', 3, 4]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) - assert_series_equal(result, expected) - - s = Series([1, '', 3, 4]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) - assert_series_equal(result, expected) - - # dates - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0)]) - s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1, - Timestamp('20010104'), '20010105'], - dtype='O') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates=True, - convert_numeric=False) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103')], dtype='M8[ns]') - assert_series_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=False) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=True) - assert_series_equal(result, expected) - - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103'), - NaT, NaT, NaT, Timestamp('20010104'), - Timestamp('20010105')], dtype='M8[ns]') - with tm.assert_produces_warning(FutureWarning): - result = s2.convert_objects(convert_dates='coerce', - convert_numeric=False) - assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = s2.convert_objects(convert_dates='coerce', - convert_numeric=True) - assert_series_equal(result, expected) - - # preserver all-nans (if convert_dates='coerce') - s = Series(['foo', 'bar', 1, 1.0], dtype='O') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - convert_numeric=False) - expected = Series([NaT] * 2 + [Timestamp(1)] * 2) - assert_series_equal(result, expected) - - # preserver if non-object - s = Series([1], dtype='float32') - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce', - 
convert_numeric=False) - assert_series_equal(result, s) - - # r = s.copy() - # r[0] = np.nan - # result = r.convert_objects(convert_dates=True,convert_numeric=False) - # assert result.dtype == 'M8[ns]' - - # dateutil parses some single letters into today's value as a date - for x in 'abcdefghijklmnopqrstuvwxyz': - s = Series([x]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) - s = Series([x.upper()]) - with tm.assert_produces_warning(FutureWarning): - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) - - def test_convert_objects_preserve_bool(self): - s = Series([1, True, 3, 5], dtype=object) - with tm.assert_produces_warning(FutureWarning): - r = s.convert_objects(convert_numeric=True) - e = Series([1, 1, 3, 5], dtype='i8') - tm.assert_series_equal(r, e) - - def test_convert_objects_preserve_all_bool(self): - s = Series([False, True, False, False], dtype=object) - with tm.assert_produces_warning(FutureWarning): - r = s.convert_objects(convert_numeric=True) - e = Series([False, True, False, False], dtype=bool) - tm.assert_series_equal(r, e) - # GH 10265 def test_convert(self): # Tests: All to nans, coerce, true From 0e3bf7f3478ffb85d64e795d72888bdb9bd9cb4b Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Mon, 3 Jun 2019 01:34:27 +0200 Subject: [PATCH 33/51] Clean up ufuncs post numpy bump (#26606) --- pandas/core/arrays/sparse.py | 9 --------- pandas/core/sparse/frame.py | 6 ------ pandas/core/sparse/series.py | 20 -------------------- 3 files changed, 35 deletions(-) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index ecc06db2bd07b..926ed6a829a6d 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -573,7 +573,6 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): Whether to explicitly copy the incoming `data` array. 
""" - __array_priority__ = 15 _pandas_ftype = 'sparse' _subtyp = 'sparse_array' # register ABCSparseArray @@ -1639,14 +1638,6 @@ def T(self): # Ufuncs # ------------------------------------------------------------------------ - def __array_wrap__(self, array, context=None): - from pandas.core.dtypes.generic import ABCSparseSeries - - ufunc, inputs, _ = context - inputs = tuple(x.to_dense() if isinstance(x, ABCSparseSeries) else x - for x in inputs) - return self.__array_ufunc__(ufunc, '__call__', *inputs) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index bf1cec7571f4d..0320da6d9a48d 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -242,12 +242,6 @@ def _init_spmatrix(self, data, index, columns, dtype=None, def to_coo(self): return SparseFrameAccessor(self).to_coo() - def __array_wrap__(self, result): - return self._constructor( - result, index=self.index, columns=self.columns, - default_kind=self._default_kind, - default_fill_value=self._default_fill_value).__finalize__(self) - def __getstate__(self): # pickling return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 3f95acdbfb42c..3814d8bb66635 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -124,26 +124,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): fill_value=result.fill_value, copy=False).__finalize__(self) - def __array_wrap__(self, result, context=None): - """ - Gets called prior to a ufunc (and after) - - See SparseArray.__array_wrap__ for detail. - """ - result = self.values.__array_wrap__(result, context=context) - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) - - def __array_finalize__(self, obj): - """ - Gets called after any ufunc or other array operations, necessary - to pass on the index. 
-        """
-        self.name = getattr(obj, 'name', None)
-        self.fill_value = getattr(obj, 'fill_value', None)
-
     # unary ops
     # TODO: See if this can be shared
     def __pos__(self):

From 635458029e11ff6d94e8132577075269fb79832c Mon Sep 17 00:00:00 2001
From: Frank Hoang
Date: Sun, 2 Jun 2019 18:42:54 -0500
Subject: [PATCH 34/51] Add more specific error message when user passes incorrect matrix format to from_coo (#26584)

---
 doc/source/whatsnew/v0.25.0.rst             |  2 +-
 pandas/core/sparse/scipy_sparse.py          | 11 ++++++++++-
 pandas/tests/arrays/sparse/test_accessor.py | 10 ++++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index 1cbec223008c4..461c883f542ab 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -694,7 +694,7 @@ Sparse
 - Significant speedup in :class:`SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`)
 - Bug in :class:`SparseFrame` constructor where passing ``None`` as the data would cause ``default_fill_value`` to be ignored (:issue:`16807`)
 - Bug in :class:`SparseDataFrame` when adding a column in which the length of values does not match length of index, ``AssertionError`` is raised instead of raising ``ValueError`` (:issue:`25484`)
-
+- Introduced a better error message in :meth:`Series.sparse.from_coo` so it raises a ``TypeError`` for inputs that are not coo matrices (:issue:`26554`)

 Other
 ^^^^^
diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py
index 7630983421ff9..0dd8958e93c13 100644
--- a/pandas/core/sparse/scipy_sparse.py
+++ b/pandas/core/sparse/scipy_sparse.py
@@ -130,10 +130,19 @@ def _coo_to_sparse_series(A, dense_index: bool = False,
     Returns
     -------
     Series or SparseSeries
+
+    Raises
+    ------
+    TypeError if A is not a coo_matrix
+
     """
     from pandas import SparseDtype

-    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
+    try:
+        s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
+    except AttributeError:
+        raise TypeError('Expected coo_matrix. Got {} instead.'
+                        .format(type(A).__name__))
     s = s.sort_index()
     if sparse_series:
         # TODO(SparseSeries): remove this and the sparse_series keyword.
diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py
index 370d222c1ab4e..d0a188a8aff3c 100644
--- a/pandas/tests/arrays/sparse/test_accessor.py
+++ b/pandas/tests/arrays/sparse/test_accessor.py
@@ -119,3 +119,13 @@ def test_series_from_coo(self, dtype, dense_index):
         )

         tm.assert_series_equal(result, expected)
+
+    @td.skip_if_no_scipy
+    def test_series_from_coo_incorrect_format_raises(self):
+        # gh-26554
+        import scipy.sparse
+        m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
+        with pytest.raises(TypeError,
+                           match='Expected coo_matrix. Got csr_matrix instead.'
+ ): + pd.Series.sparse.from_coo(m) From 23b0788118bd95bdf1adb8f86d667fa54a033423 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 3 Jun 2019 07:35:25 +0200 Subject: [PATCH 35/51] DOC/CI: restore travis CI doc build environment (#26621) --- .travis.yml | 4 ++-- ci/deps/travis-36-doc.yaml | 46 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 ci/deps/travis-36-doc.yaml diff --git a/.travis.yml b/.travis.yml index 90dd904e6cb1e..ce8817133a477 100644 --- a/.travis.yml +++ b/.travis.yml @@ -51,14 +51,14 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true allow_failures: - dist: trusty env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - - JOB="3.6, doc" ENV_FILE="environment.yml" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true before_install: - echo "before_install" diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml new file mode 100644 index 0000000000000..9d6cbd82fdc05 --- /dev/null +++ b/ci/deps/travis-36-doc.yaml @@ -0,0 +1,46 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - cython>=0.28.2 + - fastparquet>=0.2.1 + - gitpython + - html5lib + - hypothesis>=3.58.0 + - ipykernel + - ipython + - ipywidgets + - lxml + - matplotlib + - nbconvert>=5.4.1 + - nbformat + - nbsphinx + - notebook>=5.7.5 + - numexpr + - numpy + - numpydoc + - openpyxl + - pandoc + - pyarrow + - pyqt + - pytables + - python-dateutil + - python-snappy + - python=3.6.* + - pytz + - scipy + - seaborn + - sphinx + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest>=4.0.2 + - pytest-xdist + - isort From 8d124ea4c5200f218db7cea8e3ff504b0045a4e6 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Mon, 3 Jun 2019 13:56:29 +0200 Subject: [PATCH 36/51] TST/API: Forbid str-accessor for 1-level MultiIndex (#26608) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/tests/test_strings.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 461c883f542ab..0e8cd95084a8d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -434,6 +434,7 @@ Other API Changes - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- The `.str`-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) .. 
_whatsnew_0250.deprecations: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1ba0ef3918fb7..a1d522930e9aa 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -169,6 +169,14 @@ def test_api(self): assert Series.str is strings.StringMethods assert isinstance(Series(['']).str, strings.StringMethods) + def test_api_mi_raises(self): + # GH 23679 + mi = MultiIndex.from_arrays([['a', 'b', 'c']]) + with pytest.raises(AttributeError, match='Can only use .str accessor ' + 'with Index, not MultiIndex'): + mi.str + assert not hasattr(mi, 'str') + @pytest.mark.parametrize('dtype', [object, 'category']) @pytest.mark.parametrize('box', [Series, Index]) def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): From 101370645d13e1d0f256f367f4ef56a8329b56b6 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 3 Jun 2019 22:17:40 +0000 Subject: [PATCH 37/51] Minor doc cleanup because of Panel removal (#26638) --- doc/source/getting_started/basics.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 80e334054a986..5ec0094de0a91 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1455,9 +1455,8 @@ Iteration The behavior of basic iteration over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iteration -produces the values. Other data structures, like DataFrame, -follow the dict-like convention of iterating over the "keys" of the -objects. +produces the values. DataFrames follow the dict-like convention of iterating +over the "keys" of the objects. In short, basic iteration (``for i in object``) produces: @@ -1537,9 +1536,9 @@ For example: .. ipython:: python - for item, frame in df.iteritems(): - print(item) - print(frame) + for label, ser in df.iteritems(): + print(label) + print(ser) .. _basics.iterrows: From 454b8c5cdcea0cbba981d607293b990cc704f3a1 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 3 Jun 2019 19:23:49 -0600 Subject: [PATCH 38/51] DOC: Small whatsnew cleanups (#26643) --- doc/source/whatsnew/v0.25.0.rst | 65 +++++++++++++++++---------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0e8cd95084a8d..267e34efc946f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -72,7 +72,7 @@ Other Enhancements - :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter which is passed to underlying calls to :meth:`DataFrame.groupby` to speed up grouping categorical data. (:issue:`24923`) - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`) - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) -- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) +- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behavior of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. 
``sort=None`` is the default and returns a mononotically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`) - :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) @@ -123,11 +123,11 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) .. _whatsnew_0250.api_breaking.multi_indexing: -MultiIndex constructed from levels and codes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``MultiIndex`` constructed from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. -Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes +Constructing a :class:`MultiIndex` with ``NaN`` levels or codes value < -1 was allowed previously. +Now, construction with codes value < -1 is not allowed and ``NaN`` levels' corresponding codes would be reassigned as -1. (:issue:`19387`) .. ipython:: python @@ -157,8 +157,8 @@ would be reassigned as -1. (:issue:`19387`) .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: -GroupBy.apply on ``DataFrame`` evaluates first group only once -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``GroupBy.apply`` on ``DataFrame`` evaluates first group only once +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` previously evaluated the supplied function consistently twice on the first group @@ -176,7 +176,7 @@ Now every group is evaluated only a single time. print(group.name) return group -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -189,7 +189,7 @@ Now every group is evaluated only a single time. 0 x 1 1 y 2 -*New Behaviour*: +*New Behavior*: .. ipython:: python @@ -239,7 +239,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t ``'bytes'``-only data will raise an exception (except for :meth:`Series.str.decode`, :meth:`Series.str.get`, :meth:`Series.str.len`, :meth:`Series.str.slice`), see :issue:`23163`, :issue:`23011`, :issue:`23551`. -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -259,7 +259,7 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t 2 False dtype: bool -*New Behaviour*: +*New Behavior*: .. ipython:: python :okexcept: @@ -282,6 +282,8 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). *Previous Behavior*: +.. code-block:: python + In [1]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) ... ValueError: can only call with other PeriodIndex-ed objects @@ -310,7 +312,7 @@ are returned. (:issue:`21521`) df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) df -*Previous Behaviour*: +*Previous Behavior*: .. code-block:: python @@ -320,7 +322,7 @@ are returned. (:issue:`21521`) 0 x 1 1 y 2 -*New Behaviour*: +*New Behavior*: .. 
ipython:: python @@ -355,7 +357,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 df.describe() -``__str__`` methods now call ``__repr__`` rather than vica-versa +``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Pandas has until now mostly defined string representations in a Pandas objects's @@ -434,7 +436,7 @@ Other API Changes - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) -- The `.str`-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) +- The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) .. _whatsnew_0250.deprecations: @@ -468,7 +470,7 @@ The memory usage of the two approaches is identical. See :ref:`sparse.migration` Other Deprecations ^^^^^^^^^^^^^^^^^^ -- The deprecated ``.ix[]`` indexer now raises a more visible FutureWarning instead of DeprecationWarning (:issue:`26438`). +- The deprecated ``.ix[]`` indexer now raises a more visible ``FutureWarning`` instead of ``DeprecationWarning`` (:issue:`26438`). - Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`) - The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). @@ -499,14 +501,13 @@ Performance Improvements - Improved performance when slicing :class:`RangeIndex` (:issue:`26565`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) - Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) -- Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) +- Improved performance of :attr:`IntervalIndex.is_monotonic`, :attr:`IntervalIndex.is_monotonic_increasing` and :attr:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`) - Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtypes (:issue:`25708`) - Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`) - Improved performance of nanops for dtypes that cannot store NaNs. 
Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`) - Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`) -- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero - and float NaN; by faster checking the string for the possibility of being a date (:issue:`25754`) -- Improved performance of :meth:`IntervalIndex.is_unique` by removing conversion to `MultiIndex` (:issue:`24813`) +- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero and float ``NaN``; by faster checking the string for the possibility of being a date (:issue:`25754`) +- Improved performance of :attr:`IntervalIndex.is_unique` by removing conversion to ``MultiIndex`` (:issue:`24813`) .. _whatsnew_0250.bug_fixes: @@ -518,7 +519,7 @@ Categorical ^^^^^^^^^^^ - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) -- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in True (:issue:`26504`) +- Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) - Datetimelike @@ -570,7 +571,7 @@ Numeric Conversion ^^^^^^^^^^ -- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the `errors` parameter was ignored. (:issue:`25905`) +- Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) - - @@ -597,7 +598,7 @@ Indexing - Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) -- Allow keyword arguments for callable local reference used in the :method:`DataFrame.query` string (:issue:`26426`) +- Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) Missing @@ -620,8 +621,8 @@ I/O - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) -- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) -- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) +- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to :class:`Timestamp`, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`) @@ -644,7 +645,7 @@ Plotting - Fixed bug where :class:`api.extensions.ExtensionArray` could not be used in matplotlib plotting (:issue:`25587`) - Bug in an error message in :meth:`DataFrame.plot`. 
Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612` :issue:`15912` :issue:`22334`) +- Bug in incorrect ticklabel positions when plotting an index that are non-numeric / non-datetime (:issue:`7612`, :issue:`15912`, :issue:`22334`) - Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`) - - @@ -655,7 +656,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) +- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) - Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) - Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) @@ -663,11 +664,11 @@ Groupby/Resample/Rolling - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) - Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) -- Bug in :meth:`pandas.core.window.Rolling.count` and `pandas.core.window.Expanding.count` was previously ignoring the axis keyword (:issue:`13503`) +- Bug in :meth:`pandas.core.window.Rolling.count` and ``pandas.core.window.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`) - Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) +- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) - Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) @@ -682,11 +683,11 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - bug in :class:`DataFrame` instantiating with a dict of iterators or generators (e.g. ``pd.DataFrame({'A': reversed(range(3))})``) raised an error (:issue:`26349`). -- bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`). +- Bug in :class:`DataFrame` instantiating with a ``range`` (e.g. ``pd.DataFrame(range(3))``) raised an error (:issue:`26342`). - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) - Bug in :func:`Series.apply` failed when the series is a timezone aware :class:`DatetimeIndex` (:issue:`25959`) - Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`) -- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed DataFrame is sorted on all levels with the initial level sorted last (:issue:`26053`) +- Bug in :func:`DataFrame.sort_index` where an error is thrown when a multi-indexed ``DataFrame`` is sorted on all levels with the initial level sorted last (:issue:`26053`) - Bug in :meth:`Series.nlargest` treats ``True`` as smaller than ``False`` (:issue:`26154`) Sparse @@ -702,7 +703,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Bug in :func:`factorize` when passing an ``ExtensionArray`` with a custom ``na_sentinel`` (:issue:`25696`). -- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions. +- Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) .. _whatsnew_0.250.contributors: From c07d71d13b21e0b6e22146f0f546f1f8e24a64b3 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 4 Jun 2019 12:23:42 +0100 Subject: [PATCH 39/51] DOC/CI: Removing Panel specific code from validate_docstrings.py (#26627) --- scripts/validate_docstrings.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 63db50db45a7c..64eaf45376b2f 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -539,14 +539,9 @@ def first_line_ends_in_dot(self): if self.doc: return self.doc.split('\n')[0][-1] == '.' - @property - def deprecated_with_directive(self): - return '.. deprecated:: ' in (self.summary + self.extended_summary) - @property def deprecated(self): - return (self.name.startswith('pandas.Panel') - or self.deprecated_with_directive) + return '.. deprecated:: ' in (self.summary + self.extended_summary) @property def mentioned_private_classes(self): @@ -674,7 +669,7 @@ def get_validation_data(doc): errs.append(error('GL07', correct_sections=', '.join(correct_order))) - if (doc.deprecated_with_directive + if (doc.deprecated and not doc.extended_summary.startswith('.. 
deprecated:: ')): errs.append(error('GL09')) @@ -859,9 +854,9 @@ def validate_all(prefix, ignore_deprecated=False): seen[shared_code_key] = func_name - # functions from introspecting Series, DataFrame and Panel + # functions from introspecting Series and DataFrame api_item_names = set(list(zip(*api_items))[0]) - for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel): + for class_ in (pandas.Series, pandas.DataFrame): for member in inspect.getmembers(class_): func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) if (not member[0].startswith('_') From e25fd0d8ab10d6cc4dfe0f5808976f7921512c9f Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 4 Jun 2019 23:59:01 +0000 Subject: [PATCH 40/51] Remove NDFrame.select (#26641) --- doc/source/reference/frame.rst | 1 - doc/source/reference/series.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/generic.py | 34 ------------------ .../tests/frame/test_axis_select_reindex.py | 35 ------------------- pandas/tests/series/indexing/test_indexing.py | 14 -------- 6 files changed, 1 insertion(+), 85 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index b4fb85c028b3e..7d5cd5d245631 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -204,7 +204,6 @@ Reindexing / Selection / Label manipulation DataFrame.rename_axis DataFrame.reset_index DataFrame.sample - DataFrame.select DataFrame.set_axis DataFrame.set_index DataFrame.tail diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 8fccdea979602..79beeb0022307 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -211,7 +211,6 @@ Reindexing / Selection / Label manipulation Series.rename_axis Series.reset_index Series.sample - Series.select Series.set_axis Series.take Series.tail diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 267e34efc946f..4e8af90b85f83 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -487,6 +487,7 @@ Removal of prior version deprecations/changes - Removed the previously deprecated ``parse_cols`` keyword in :func:`read_excel` (:issue:`16488`) - Removed the previously deprecated ``pd.options.html.border`` (:issue:`16970`) - Removed the previously deprecated ``convert_objects`` (:issue:`11221`) +- Removed the previously deprecated ``select`` method of ``DataFrame`` and ``Series`` (:issue:`17633`) .. _whatsnew_0250.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2428bbad7003b..19d093dd29457 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3682,40 +3682,6 @@ class animal locomotion _xs = xs # type: Callable - def select(self, crit, axis=0): - """ - Return data corresponding to axis labels matching criteria. - - .. deprecated:: 0.21.0 - Use df.loc[df.index.map(crit)] to select via labels - - Parameters - ---------- - crit : function - To be called on each index (label). Should return True or False - axis : int - - Returns - ------- - selection : same type as caller - """ - warnings.warn("'select' is deprecated and will be removed in a " - "future release. 
You can use " - ".loc[labels.map(crit)] as a replacement", - FutureWarning, stacklevel=2) - - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - axis_values = self._get_axis(axis) - - if len(axis_values) > 0: - new_axis = axis_values[ - np.asarray([bool(crit(label)) for label in axis_values])] - else: - new_axis = axis_values - - return self.reindex(**{axis_name: new_axis}) - def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): """ diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index ad6c66c911615..42f98d5c96aa5 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -895,41 +895,6 @@ def test_filter_corner(self): result = empty.filter(like='foo') assert_frame_equal(result, empty) - def test_select(self): - - # deprecated: gh-12410 - f = lambda x: x.weekday() == 2 - index = self.tsframe.index[[f(x) for x in self.tsframe.index]] - expected_weekdays = self.tsframe.reindex(index=index) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = self.tsframe.select(f, axis=0) - assert_frame_equal(result, expected_weekdays) - - result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) - expected = self.frame.reindex(columns=['B', 'D']) - assert_frame_equal(result, expected, check_names=False) - - # replacement - f = lambda x: x.weekday == 2 - result = self.tsframe.loc(axis=0)[f(self.tsframe.index)] - assert_frame_equal(result, expected_weekdays) - - crit = lambda x: x in ['B', 'D'] - result = self.frame.loc(axis=1)[(self.frame.columns.map(crit))] - expected = self.frame.reindex(columns=['B', 'D']) - assert_frame_equal(result, expected, check_names=False) - - # doc example - df = DataFrame({'A': [1, 2, 3]}, index=['foo', 'bar', 'baz']) - - crit = lambda x: x in ['bar', 'baz'] - with tm.assert_produces_warning(FutureWarning): - expected = df.select(crit) - result = df.loc[df.index.map(crit)] - assert_frame_equal(result, expected, check_names=False) - def test_take(self): # homogeneous order = [3, 1, 2, 0] diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6641311faace2..702e22b6741e4 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -772,20 +772,6 @@ def test_setitem_slice_into_readonly_backing_data(): """ -def test_select(test_data): - # deprecated: gh-12410 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - n = len(test_data.ts) - result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2]) - expected = test_data.ts.reindex(test_data.ts.index[n // 2:]) - assert_series_equal(result, expected) - - result = test_data.ts.select(lambda x: x.weekday() == 2) - expected = test_data.ts[test_data.ts.index.weekday == 2] - assert_series_equal(result, expected) - - def test_pop(): # GH 6600 df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) From 01d97d48b08c546a46b91c27a5886f52b46f22c2 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Wed, 5 Jun 2019 15:22:08 +0800 Subject: [PATCH 41/51] [TST] Fix test_quantile_interpolation_int (#26633) --- pandas/tests/frame/test_quantile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 9ccbd290923ba..097477c42d249 100644 --- a/pandas/tests/frame/test_quantile.py +++ 
b/pandas/tests/frame/test_quantile.py @@ -160,8 +160,7 @@ def test_quantile_interpolation_int(self, int_frame): assert q['A'] == np.percentile(df['A'], 10) # test with and without interpolation keyword - # TODO: q1 is not different from q - q1 = df.quantile(0.1) + q1 = df.quantile(0.1, axis=0, interpolation='linear') assert q1['A'] == np.percentile(df['A'], 10) tm.assert_series_equal(q, q1) From c57f206360108c327d8256e716080fb1a2523fd8 Mon Sep 17 00:00:00 2001 From: shawnbrown Date: Wed, 5 Jun 2019 07:53:40 -0400 Subject: [PATCH 42/51] Update Accessors URL for PdVega package. (#26653) See altair-viz/pdvega@7476a8a26b for details. --- doc/source/ecosystem.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e232bd2157611..b1a5430752558 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -363,4 +363,5 @@ Library Accessor Classes ============== ========== ========================= .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest -.. _pdvega: https://jakevdp.github.io/pdvega/ +.. _pdvega: https://altair-viz.github.io/pdvega/ + From 758e35d7c8aa46279cbb9d6191ddb9842f1ce31b Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Wed, 5 Jun 2019 13:46:37 +0100 Subject: [PATCH 43/51] DEPS: Adding missing doc dependencies to environment.yml (#26657) --- environment.yml | 7 +++++++ requirements-dev.txt | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/environment.yml b/environment.yml index cf17dc1281ec9..91ea26eef4b61 100644 --- a/environment.yml +++ b/environment.yml @@ -17,10 +17,17 @@ dependencies: - flake8-rst>=0.6.0,<=0.7.0 - gitpython - hypothesis>=3.82 + - ipywidgets - isort - moto - mypy + - nbconvert>=5.4.1 + - nbformat + - notebook>=5.7.5 + - pandoc - pycodestyle + - pyqt + - python-snappy - pytest>=4.0.2 - pytest-mock - sphinx diff --git a/requirements-dev.txt b/requirements-dev.txt index 115a93495c95b..e6085920a9999 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,10 +8,17 @@ flake8-comprehensions flake8-rst>=0.6.0,<=0.7.0 gitpython hypothesis>=3.82 +ipywidgets isort moto mypy +nbconvert>=5.4.1 +nbformat +notebook>=5.7.5 +pandoc pycodestyle +pyqt +python-snappy pytest>=4.0.2 pytest-mock sphinx From 6ce7fc70a0103aaf8d6d6ff908a61b561447c218 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 5 Jun 2019 12:50:33 +0000 Subject: [PATCH 44/51] use range in RangeIndex instead of _start etc. (#26581) --- doc/source/whatsnew/v0.25.0.rst | 3 + pandas/core/dtypes/common.py | 29 +++ pandas/core/dtypes/concat.py | 21 +- pandas/core/frame.py | 10 +- pandas/core/indexes/range.py | 304 +++++++++++++---------------- pandas/core/series.py | 6 +- pandas/io/packers.py | 7 +- pandas/tests/indexes/test_range.py | 22 ++- 8 files changed, 202 insertions(+), 200 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4e8af90b85f83..4018418294963 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -476,6 +476,9 @@ Other Deprecations the :meth:`SparseArray.to_dense` method instead (:issue:`26421`). - The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`) - The :meth:`DataFrame.compound` and :meth:`Series.compound` methods are deprecated and will be removed in a future version (:issue:`26405`). 
+- The internal ``_start``, ``_stop`` and ``_step`` attributes of :class:`RangeIndex` have been deprecated.
+  Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`).
+

 .. _whatsnew_0250.prior_deprecations:

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index b5cd73a81962b..4029e6f4bfdb5 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -1,4 +1,5 @@
 """ common type operations """
+from typing import Union
 import warnings

 import numpy as np
@@ -125,6 +126,34 @@ def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array:
     return arr.astype('float64', copy=copy)


+def ensure_python_int(value: Union[int, np.integer]) -> int:
+    """
+    Ensure that a value is a python int.
+
+    Parameters
+    ----------
+    value: int or numpy.integer
+
+    Returns
+    -------
+    int
+
+    Raises
+    ------
+    TypeError: if the value isn't an int or can't be converted to one.
+    """
+    if not is_scalar(value):
+        raise TypeError("Value needs to be a scalar value, was type {}"
+                        .format(type(value)))
+    msg = "Wrong type {} for value {}"
+    try:
+        new_value = int(value)
+        assert (new_value == value)
+    except (TypeError, ValueError, AssertionError):
+        raise TypeError(msg.format(type(value), value))
+    return new_value
+
+
 def classes(*klasses):
     """ evaluate if the tipo is a subclass of the klasses """
     return lambda tipo: issubclass(tipo, klasses)
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index b22ed45642cf6..e2c6fba322be0 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -541,36 +541,37 @@ def _concat_rangeindex_same_dtype(indexes):
     """
     from pandas import Int64Index, RangeIndex

-    start = step = next = None
+    start = step = next_ = None

     # Filter the empty indexes
     non_empty_indexes = [obj for obj in indexes if len(obj)]

     for obj in non_empty_indexes:
+        rng = obj._range  # type: range

         if start is None:
             # This is set by the first non-empty index
-            start = obj._start
-            if step is None and len(obj) > 1:
-                step = obj._step
+            start = rng.start
+            if step is None and len(rng) > 1:
+                step = rng.step

         elif step is None:
             # First non-empty index had only one element
-            if obj._start == start:
+            if rng.start == start:
                 return _concat_index_same_dtype(indexes, klass=Int64Index)
-            step = obj._start - start
+            step = rng.start - start

-        non_consecutive = ((step != obj._step and len(obj) > 1) or
-                           (next is not None and obj._start != next))
+        non_consecutive = ((step != rng.step and len(rng) > 1) or
+                           (next_ is not None and rng.start != next_))
         if non_consecutive:
             return _concat_index_same_dtype(indexes, klass=Int64Index)

         if step is not None:
-            next = obj[-1] + step
+            next_ = rng[-1] + step

     if non_empty_indexes:
         # Get the stop value from "next" or alternatively
         # from the last non-empty index
-        stop = non_empty_indexes[-1]._stop if next is None else next
+        stop = non_empty_indexes[-1].stop if next_ is None else next_
         return RangeIndex(start, stop, step)

     # Here all "indexes" had 0 length, i.e. were empty.
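A minimal sketch of the behavior the concat helper above is meant to preserve, assuming 0.25-era semantics: appending consecutive ranges keeps the ``RangeIndex`` type, while any gap or step mismatch falls back to a materialized ``Int64Index``:

    import pandas as pd

    left = pd.RangeIndex(0, 4)    # 0, 1, 2, 3
    right = pd.RangeIndex(4, 8)   # 4, 5, 6, 7

    # start/stop/step line up, so the result is RangeIndex(start=0, stop=8, step=1)
    print(left.append(right))

    # a gap breaks the pattern; the result is Int64Index([0, 1, 2, 3, 10, 11])
    print(left.append(pd.RangeIndex(10, 12)))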
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5957b23535350..48dfa57c47bf6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2282,7 +2282,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, text_col 5 non-null object float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) - memory usage: 200.0+ bytes + memory usage: 248.0+ bytes Prints a summary of columns count and its dtypes but not per column information: @@ -2292,7 +2292,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, RangeIndex: 5 entries, 0 to 4 Columns: 3 entries, int_col to float_col dtypes: float64(1), int64(1), object(1) - memory usage: 200.0+ bytes + memory usage: 248.0+ bytes Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content and writes to a text file: @@ -2494,7 +2494,7 @@ def memory_usage(self, index=True, deep=False): 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() - Index 80 + Index 128 int64 40000 float64 40000 complex128 80000 @@ -2513,7 +2513,7 @@ def memory_usage(self, index=True, deep=False): The memory footprint of `object` dtype columns is ignored by default: >>> df.memory_usage(deep=True) - Index 80 + Index 128 int64 40000 float64 40000 complex128 80000 @@ -2525,7 +2525,7 @@ def memory_usage(self, index=True, deep=False): many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5168 + 5216 """ result = Series([c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], index=self.columns) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 9401de3346ccd..82fd7342c027c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -12,7 +12,8 @@ from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( - is_int64_dtype, is_integer, is_scalar, is_timedelta64_dtype) + ensure_python_int, is_int64_dtype, is_integer, is_scalar, + is_timedelta64_dtype) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, ABCTimedeltaIndex) @@ -65,6 +66,7 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + _range = None # type: range # check whether self._data has benn called _cached_data = None # type: np.ndarray @@ -91,39 +93,19 @@ def __new__(cls, start=None, stop=None, step=None, **dict(start._get_data_as_items())) # validate the arguments - def ensure_int(value, field): - msg = ("RangeIndex(...) must be called with integers," - " {value} was passed for {field}") - if not is_scalar(value): - raise TypeError(msg.format(value=type(value).__name__, - field=field)) - try: - new_value = int(value) - assert(new_value == value) - except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(value=type(value).__name__, - field=field)) + if com._all_none(start, stop, step): + raise TypeError("RangeIndex(...) must be called with integers") - return new_value + start = ensure_python_int(start) if start is not None else 0 - if com._all_none(start, stop, step): - msg = "RangeIndex(...) 
must be called with integers" - raise TypeError(msg) - elif start is None: - start = 0 - else: - start = ensure_int(start, 'start') if stop is None: - stop = start - start = 0 + start, stop = 0, start else: - stop = ensure_int(stop, 'stop') - if step is None: - step = 1 - elif step == 0: + stop = ensure_python_int(stop) + + step = ensure_python_int(step) if step is not None else 1 + if step == 0: raise ValueError("Step must not be zero") - else: - step = ensure_int(step, 'step') return cls._simple_new(start, stop, step, name) @@ -142,7 +124,7 @@ def from_range(cls, data, name=None, dtype=None, **kwargs): 'range, {1} was passed'.format(cls.__name__, repr(data))) start, stop, step = data.start, data.stop, data.step - return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs) + return cls(start, stop, step, dtype=dtype, name=name, **kwargs) @classmethod def _simple_new(cls, start, stop=None, step=None, name=None, @@ -156,20 +138,16 @@ def _simple_new(cls, start, stop=None, step=None, name=None, if start is None or not is_integer(start): try: - - return RangeIndex(start, stop, step, name=name, **kwargs) + return cls(start, stop, step, name=name, **kwargs) except TypeError: return Index(start, stop, step, name=name, **kwargs) - result._start = start - result._stop = stop or 0 - result._step = step or 1 + result._range = range(start, stop or 0, step or 1) + result.name = name for k, v in kwargs.items(): setattr(result, k, v) - result._range = range(result._start, result._stop, result._step) - result._reset_identity() return result @@ -196,7 +174,7 @@ def _data(self): triggering the construction. """ if self._cached_data is None: - self._cached_data = np.arange(self._start, self._stop, self._step, + self._cached_data = np.arange(self.start, self.stop, self.step, dtype=np.int64) return self._cached_data @@ -206,9 +184,10 @@ def _int64index(self): def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ - return [('start', self._start), - ('stop', self._stop), - ('step', self._step)] + rng = self._range + return [('start', rng.start), + ('stop', rng.stop), + ('step', rng.step)] def __reduce__(self): d = self._get_attributes_dict() @@ -235,39 +214,79 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- - @property + _deprecation_message = ("RangeIndex.{} is deprecated and will be " + "removed in a future version. Use RangeIndex.{} " + "instead") + + @cache_readonly def start(self): """ - The value of the `start` parameter (or ``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied) """ # GH 25710 - return self._start + return self._range.start @property + def _start(self): + """ + The value of the `start` parameter (``0`` if this was not supplied) + + .. deprecated:: 0.25.0 + Use ``start`` instead. + """ + warnings.warn(self._deprecation_message.format("_start", "start"), + DeprecationWarning, stacklevel=2) + return self.start + + @cache_readonly def stop(self): """ The value of the `stop` parameter """ - # GH 25710 - return self._stop + return self._range.stop @property + def _stop(self): + """ + The value of the `stop` parameter + + .. deprecated:: 0.25.0 + Use ``stop`` instead. 
+ """ + # GH 25710 + warnings.warn(self._deprecation_message.format("_stop", "stop"), + DeprecationWarning, stacklevel=2) + return self.stop + + @cache_readonly def step(self): """ - The value of the `step` parameter (or ``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied) """ # GH 25710 - return self._step + return self._range.step + + @property + def _step(self): + """ + The value of the `step` parameter (``1`` if this was not supplied) + + .. deprecated:: 0.25.0 + Use ``step`` instead. + """ + # GH 25710 + warnings.warn(self._deprecation_message.format("_step", "step"), + DeprecationWarning, stacklevel=2) + return self.step @cache_readonly def nbytes(self): """ - Return the number of bytes in the underlying data - On implementations where this is undetermined (PyPy) - assume 24 bytes for each value + Return the number of bytes in the underlying data. """ - return sum(getsizeof(getattr(self, v), 24) for v in - ['_start', '_stop', '_step']) + rng = self._range + return getsizeof(rng) + sum(getsizeof(getattr(rng, attr_name)) + for attr_name in ['start', 'stop', 'step']) def memory_usage(self, deep=False): """ @@ -305,11 +324,11 @@ def is_unique(self): @cache_readonly def is_monotonic_increasing(self): - return self._step > 0 or len(self) <= 1 + return self._range.step > 0 or len(self) <= 1 @cache_readonly def is_monotonic_decreasing(self): - return self._step < 0 or len(self) <= 1 + return self._range.step < 0 or len(self) <= 1 @property def has_duplicates(self): @@ -325,13 +344,13 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) def tolist(self): - return list(range(self._start, self._stop, self._step)) + return list(self._range) @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is None: name = kwargs.get("name", self.name) - return RangeIndex._simple_new( + return self._simple_new( name=name, **dict(self._get_data_as_items())) else: kwargs.setdefault('name', self.name) @@ -342,18 +361,17 @@ def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if name is None: name = self.name - return RangeIndex._simple_new( - name=name, **dict(self._get_data_as_items())) + return self.from_range(self._range, name=name) def _minmax(self, meth): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif ((meth == 'min' and self._step > 0) or - (meth == 'max' and self._step < 0)): - return self._start + elif ((meth == 'min' and self.step > 0) or + (meth == 'max' and self.step < 0)): + return self.start - return self._start + self._step * no_steps + return self.start + self.step * no_steps def min(self, axis=None, skipna=True, *args, **kwargs): """The minimum value of the RangeIndex""" @@ -382,7 +400,7 @@ def argsort(self, *args, **kwargs): """ nv.validate_argsort(args, kwargs) - if self._step > 0: + if self._range.step > 0: return np.arange(len(self)) else: return np.arange(len(self) - 1, -1, -1) @@ -392,15 +410,7 @@ def equals(self, other): Determines if two Index objects contain the same elements. 
""" if isinstance(other, RangeIndex): - ls = len(self) - lo = len(other) - return (ls == lo == 0 or - ls == lo == 1 and - self._start == other._start or - ls == lo and - self._start == other._start and - self._step == other._step) - + return self._range == other._range return super().equals(other) def intersection(self, other, sort=False): @@ -433,39 +443,40 @@ def intersection(self, other, sort=False): return super().intersection(other, sort=sort) if not len(self) or not len(other): - return RangeIndex._simple_new(None) + return self._simple_new(None) - first = self[::-1] if self._step < 0 else self - second = other[::-1] if other._step < 0 else other + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range # check whether intervals intersect # deals with in- and decreasing ranges - int_low = max(first._start, second._start) - int_high = min(first._stop, second._stop) + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) if int_high <= int_low: - return RangeIndex._simple_new(None) + return self._simple_new(None) # Method hint: linear Diophantine equation # solve intersection problem # performance hint: for identical step sizes, could use # cheaper alternative - gcd, s, t = first._extended_gcd(first._step, second._step) + gcd, s, t = self._extended_gcd(first.step, second.step) # check whether element sets intersect - if (first._start - second._start) % gcd: - return RangeIndex._simple_new(None) + if (first.start - second.start) % gcd: + return self._simple_new(None) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = first._start + (second._start - first._start) * \ - first._step // gcd * s - new_step = first._step * second._step // gcd - new_index = RangeIndex._simple_new(tmp_start, int_high, new_step) + tmp_start = first.start + (second.start - first.start) * \ + first.step // gcd * s + new_step = first.step * second.step // gcd + new_index = self._simple_new(tmp_start, int_high, new_step) # adjust index to limiting interval - new_index._start = new_index._min_fitting_element(int_low) + new_start = new_index._min_fitting_element(int_low) + new_index = self._simple_new(new_start, new_index.stop, new_index.step) - if (self._step < 0 and other._step < 0) is not (new_index._step < 0): + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): new_index = new_index[::-1] if sort is None: new_index = new_index.sort_values() @@ -473,13 +484,13 @@ def intersection(self, other, sort=False): def _min_fitting_element(self, lower_limit): """Returns the smallest element greater than or equal to the limit""" - no_steps = -(-(lower_limit - self._start) // abs(self._step)) - return self._start + abs(self._step) * no_steps + no_steps = -(-(lower_limit - self.start) // abs(self.step)) + return self.start + abs(self.step) * no_steps def _max_fitting_element(self, upper_limit): """Returns the largest element smaller than or equal to the limit""" - no_steps = (upper_limit - self._start) // abs(self._step) - return self._start + abs(self._step) * no_steps + no_steps = (upper_limit - self.start) // abs(self.step) + return self.start + abs(self.step) * no_steps def _extended_gcd(self, a, b): """ @@ -522,16 +533,16 @@ def _union(self, other, sort): return super()._union(other, sort=sort) if isinstance(other, RangeIndex) and sort is None: - start_s, step_s = self._start, self._step - end_s = self._start + self._step * (len(self) - 1) - 
start_o, step_o = other._start, other._step - end_o = other._start + other._step * (len(other) - 1) - if self._step < 0: + start_s, step_s = self.start, self.step + end_s = self.start + self.step * (len(self) - 1) + start_o, step_o = other.start, other.step + end_o = other.start + other.step * (len(other) - 1) + if self.step < 0: start_s, step_s, end_s = end_s, -step_s, start_s - if other._step < 0: + if other.step < 0: start_o, step_o, end_o = end_o, -step_o, start_o if len(self) == 1 and len(other) == 1: - step_s = step_o = abs(self._start - other._start) + step_s = step_o = abs(self.start - other.start) elif len(self) == 1: step_s = step_o elif len(other) == 1: @@ -542,21 +553,23 @@ def _union(self, other, sort): if ((start_s - start_o) % step_s == 0 and (start_s - end_o) <= step_s and (start_o - end_s) <= step_s): - return RangeIndex(start_r, end_r + step_s, step_s) + return self.__class__(start_r, end_r + step_s, step_s) if ((step_s % 2 == 0) and (abs(start_s - start_o) <= step_s / 2) and (abs(end_s - end_o) <= step_s / 2)): - return RangeIndex(start_r, end_r + step_s / 2, step_s / 2) + return self.__class__(start_r, + end_r + step_s / 2, + step_s / 2) elif step_o % step_s == 0: if ((start_o - start_s) % step_s == 0 and (start_o + step_s >= start_s) and (end_o - step_s <= end_s)): - return RangeIndex(start_r, end_r + step_s, step_s) + return self.__class__(start_r, end_r + step_s, step_s) elif step_s % step_o == 0: if ((start_s - start_o) % step_o == 0 and (start_s + step_o >= start_o) and (end_s - step_o <= end_o)): - return RangeIndex(start_r, end_r + step_o, step_o) + return self.__class__(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) @Appender(_index_shared_docs['join']) @@ -576,7 +589,7 @@ def __len__(self): """ return the length of the RangeIndex """ - return max(0, -(-(self._stop - self._start) // self._step)) + return len(self._range) @property def size(self): @@ -597,59 +610,15 @@ def __getitem__(self, key): n = com.cast_scalar_indexer(key) if n != key: return super_getitem(key) - if n < 0: - n = len(self) + key - if n < 0 or n > len(self) - 1: + try: + return self._range[key] + except IndexError: raise IndexError("index {key} is out of bounds for axis 0 " "with size {size}".format(key=key, size=len(self))) - return self._start + n * self._step - if isinstance(key, slice): - - # This is basically PySlice_GetIndicesEx, but delegation to our - # super routines if we don't have integers - - length = len(self) - - # complete missing slice information - step = 1 if key.step is None else key.step - if key.start is None: - start = length - 1 if step < 0 else 0 - else: - start = key.start - - if start < 0: - start += length - if start < 0: - start = -1 if step < 0 else 0 - if start >= length: - start = length - 1 if step < 0 else length - - if key.stop is None: - stop = -1 if step < 0 else length - else: - stop = key.stop - - if stop < 0: - stop += length - if stop < 0: - stop = -1 - if stop > length: - stop = length - - # delegate non-integer slices - if (start != int(start) or - stop != int(stop) or - step != int(step)): - return super_getitem(key) - - # convert indexes to values - start = self._start + self._step * start - stop = self._start + self._step * stop - step = self._step * step - - return RangeIndex._simple_new(start, stop, step, name=self.name) + new_range = self._range[key] + return self.from_range(new_range, name=self.name) # fall back to Int64Index return super_getitem(key) @@ -660,17 +629,15 @@ def __floordiv__(self, other): if 
is_integer(other) and other != 0: if (len(self) == 0 or - self._start % other == 0 and - self._step % other == 0): - start = self._start // other - step = self._step // other + self.start % other == 0 and + self.step % other == 0): + start = self.start // other + step = self.step // other stop = start + len(self) * step - return RangeIndex._simple_new( - start, stop, step, name=self.name) + return self._simple_new(start, stop, step, name=self.name) if len(self) == 1: - start = self._start // other - return RangeIndex._simple_new( - start, start + 1, 1, name=self.name) + start = self.start // other + return self._simple_new(start, start + 1, 1, name=self.name) return self._int64index // other @classmethod @@ -712,7 +679,7 @@ def _evaluate_numeric_binop(self, other): # apply if we have an override if step: with np.errstate(all='ignore'): - rstep = step(left._step, right) + rstep = step(left.step, right) # we don't have a representable op # so return a base index @@ -720,16 +687,13 @@ def _evaluate_numeric_binop(self, other): raise ValueError else: - rstep = left._step + rstep = left.step with np.errstate(all='ignore'): - rstart = op(left._start, right) - rstop = op(left._stop, right) + rstart = op(left.start, right) + rstop = op(left.stop, right) - result = RangeIndex(rstart, - rstop, - rstep, - **attrs) + result = self.__class__(rstart, rstop, rstep, **attrs) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return diff --git a/pandas/core/series.py b/pandas/core/series.py index 8fb6ad3e3ccc5..472d984234275 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4010,7 +4010,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = pd.Series(range(3)) >>> s.memory_usage() - 104 + 152 Not including the index gives the size of the rest of the data, which is necessarily smaller: @@ -4024,9 +4024,9 @@ def memory_usage(self, index=True, deep=False): >>> s.values array(['a', 'b'], dtype=object) >>> s.memory_usage() - 96 + 144 >>> s.memory_usage(deep=True) - 212 + 260 """ v = super().memory_usage(deep=deep) if index: diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 1309bd1fef421..ead0fbd263ebf 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -367,9 +367,10 @@ def encode(obj): return {'typ': 'range_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), - 'start': getattr(obj, '_start', None), - 'stop': getattr(obj, '_stop', None), - 'step': getattr(obj, '_step', None)} + 'start': obj._range.start, + 'stop': obj._range.stop, + 'step': obj._range.step, + } elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 477a4e527f278..bca50186827de 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -51,10 +51,8 @@ def test_constructor(self, args, kwargs, start, stop, step, name): expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) assert isinstance(result, RangeIndex) - assert result._start == start - assert result._stop == stop - assert result._step == step assert result.name is name + assert result._range == range(start, stop, step) tm.assert_index_equal(result, expected) def test_constructor_invalid_args(self): @@ -169,14 +167,19 @@ def test_start_stop_step_attrs(self, index, start, stop, step): assert index.stop == stop assert index.step == step + def test_deprecated_start_stop_step_attrs(self): + # GH 26581 
+ idx = self.create_index() + for attr_name in ['_start', '_stop', '_step']: + with tm.assert_produces_warning(DeprecationWarning): + getattr(idx, attr_name) + def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() assert i_copy is not i assert i_copy.identical(i) - assert i_copy._start == 0 - assert i_copy._stop == 5 - assert i_copy._step == 1 + assert i_copy._range == range(0, 5, 1) assert i_copy.name == 'Foo' def test_repr(self): @@ -243,8 +246,9 @@ def test_dtype(self): def test_cached_data(self): # GH 26565 - # Calling RangeIndex._data caches an int64 array of the same length at - # self._cached_data. This tests whether _cached_data has been set. + # Calling RangeIndex._data caches an int64 array of the same length as + # self at self._cached_data. + # This tests whether _cached_data is being set by various operations. idx = RangeIndex(0, 100, 10) assert idx._cached_data is None @@ -273,7 +277,7 @@ def test_cached_data(self): df.iloc[5:10] assert idx._cached_data is None - # actually calling data._data + # actually calling idx._data assert isinstance(idx._data, np.ndarray) assert isinstance(idx._cached_data, np.ndarray) From 8ef9a6356f9f00e22908dd04aa47b2a5d6c38725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Heikkil=C3=A4?= <42970828+mahepe@users.noreply.github.com> Date: Wed, 5 Jun 2019 15:54:34 +0300 Subject: [PATCH 45/51] TST: Test sorting levels not aligned with index (#25775) (#26492) --- pandas/tests/frame/test_sorting.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 246ba943a4509..96aeb608ba3b8 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -227,6 +227,18 @@ def test_stable_descending_multicolumn_sort(self): kind='mergesort') assert_frame_equal(sorted_df, expected) + def test_sort_multi_index(self): + # GH 25775, testing that sorting by index works with a multi-index. 
+ df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0], + 'c': [0, 1, 2], 'd': list('abc')}) + result = df.set_index(list('abc')).sort_index(level=list('ba')) + + expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0], + 'c': [1, 2, 0], 'd': list('bca')}) + expected = expected.set_index(list('abc')) + + tm.assert_frame_equal(result, expected) + def test_stable_categorial(self): # GH 16793 df = DataFrame({ From e0c41f79104c5bc61952c9a14f1883cd5bda53f7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 5 Jun 2019 08:59:12 -0400 Subject: [PATCH 46/51] Remove SharedItems from test_excel (#26579) --- pandas/tests/io/test_excel.py | 332 +++++++++++++++++----------------- 1 file changed, 169 insertions(+), 163 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 7693caf3b31d2..b99f0336fa4c5 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -26,13 +26,22 @@ from pandas.io.formats.excel import ExcelFormatter from pandas.io.parsers import read_csv -_seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() -_frame = DataFrame(_seriesd)[:10] -_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] -_tsframe = tm.makeTimeDataFrame()[:5] -_mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' + +@pytest.fixture +def frame(float_frame): + return float_frame[:10] + + +@pytest.fixture +def frame2(float_frame): + float_frame = float_frame.copy() + float_frame.columns = ['D', 'C', 'B', 'A'] + return float_frame[:10] + + +@pytest.fixture +def tsframe(): + return tm.makeTimeDataFrame()[:5] @contextlib.contextmanager @@ -49,18 +58,8 @@ def ignore_xlrd_time_clock_warning(): yield -class SharedItems: - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() - - @td.skip_if_no('xlrd', '1.0.0') -class ReadingTestsBase(SharedItems): +class ReadingTestsBase: # This is based on ExcelWriterBase @pytest.fixture(autouse=True, params=['xlrd', None]) @@ -1055,9 +1054,9 @@ class TestXlrdReader(ReadingTestsBase): """ @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext): + def test_read_xlrd_book(self, ext, frame): import xlrd - df = self.frame + df = frame engine = "xlrd" sheet_name = "SheetA" @@ -1075,7 +1074,7 @@ def test_read_xlrd_book(self, ext): tm.assert_frame_equal(df, result) -class _WriterBase(SharedItems): +class _WriterBase: @pytest.fixture(autouse=True) def set_engine_and_path(self, request, merge_cells, engine, ext): @@ -1150,75 +1149,79 @@ def test_excel_sheet_by_name_raise(self, *_): with pytest.raises(xlrd.XLRDError): pd.read_excel(xl, "0") - def test_excel_writer_context_manager(self, *_): + def test_excel_writer_context_manager(self, frame, frame2, *_): with ExcelWriter(self.path) as writer: - self.frame.to_excel(writer, "Data1") - self.frame2.to_excel(writer, "Data2") + frame.to_excel(writer, "Data1") + frame2.to_excel(writer, "Data2") with ExcelFile(self.path) as reader: found_df = pd.read_excel(reader, "Data1", index_col=0) found_df2 = pd.read_excel(reader, "Data2", index_col=0) - tm.assert_frame_equal(found_df, self.frame) - tm.assert_frame_equal(found_df2, self.frame2) + tm.assert_frame_equal(found_df, frame) + tm.assert_frame_equal(found_df2, frame2) - def test_roundtrip(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_roundtrip(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 
'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # test roundtrip - self.frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1') recons = pd.read_excel(self.path, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1', index=False) recons = pd.read_excel(self.path, 'test1', index_col=None) - recons.index = self.frame.index - tm.assert_frame_equal(self.frame, recons) + recons.index = frame.index + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', na_rep='NA') + frame.to_excel(self.path, 'test1', na_rep='NA') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=['NA']) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 3611 - self.frame.to_excel(self.path, 'test1', na_rep='88') + frame.to_excel(self.path, 'test1', na_rep='88') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=['88']) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, 'test1', na_rep='88') + frame.to_excel(self.path, 'test1', na_rep='88') recons = pd.read_excel( self.path, 'test1', index_col=0, na_values=[88, 88.0]) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 6573 - self.frame.to_excel(self.path, 'Sheet1') + frame.to_excel(self.path, 'Sheet1') recons = pd.read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) - self.frame.to_excel(self.path, '0') + frame.to_excel(self.path, '0') recons = pd.read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method - s = self.frame["A"] + s = frame["A"] s.to_excel(self.path) recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) - def test_mixed(self, merge_cells, engine, ext): - self.mixed_frame.to_excel(self.path, 'test1') + def test_mixed(self, merge_cells, engine, ext, frame): + mixed_frame = frame.copy() + mixed_frame['foo'] = 'bar' + + mixed_frame.to_excel(self.path, 'test1') reader = ExcelFile(self.path) recons = pd.read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.mixed_frame, recons) + tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, *_): - df = tm.makeTimeDataFrame()[:5] + def test_ts_frame(self, tsframe, *_): + df = tsframe df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -1226,33 +1229,34 @@ def test_ts_frame(self, *_): recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_basics_with_nan(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + def test_basics_with_nan(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan + 
frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) @pytest.mark.parametrize("np_type", [ np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, merge_cells, engine, ext, np_type): # Test np.int values read come back as int # (rather than float which is Excel's format). - frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) - frame.to_excel(self.path, "test1") + df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), + dtype=np_type) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - int_frame = frame.astype(np.int64) + int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) recons2 = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. - float_frame = frame.astype(float) + float_frame = df.astype(float) recons = pd.read_excel(self.path, "test1", convert_float=False, index_col=0) tm.assert_frame_equal(recons, float_frame, @@ -1263,120 +1267,123 @@ def test_int_types(self, merge_cells, engine, ext, np_type): np.float16, np.float32, np.float64]) def test_float_types(self, merge_cells, engine, ext, np_type): # Test np.float values read come back as float. - frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(self.path, "test1") + df = DataFrame(np.random.random_sample(10), dtype=np_type) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(frame, recons, check_dtype=False) + tm.assert_frame_equal(df, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, merge_cells, engine, ext, np_type): # Test np.bool values read come back as float. 
- frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(self.path, "test1") + df = (DataFrame([1, 0, True, False], dtype=np_type)) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) - tm.assert_frame_equal(frame, recons) + tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, *_): - frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - frame.to_excel(self.path, "test1") + df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(frame, recons) + tm.assert_frame_equal(df, recons) - def test_sheets(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_sheets(self, merge_cells, engine, ext, frame, tsframe): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # Test writing to separate sheets writer = ExcelWriter(self.path) - self.frame.to_excel(writer, 'test1') - self.tsframe.to_excel(writer, 'test2') + frame.to_excel(writer, 'test1') + tsframe.to_excel(writer, 'test2') writer.save() reader = ExcelFile(self.path) recons = pd.read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) + tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, 'test2', index_col=0) - tm.assert_frame_equal(self.tsframe, recons) + tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) assert 'test1' == reader.sheet_names[0] assert 'test2' == reader.sheet_names[1] - def test_colaliases(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_colaliases(self, merge_cells, engine, ext, frame, frame2): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', index=False) # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(self.path, 'test1', header=col_aliases) + frame2.to_excel(self.path, 'test1', header=col_aliases) reader = ExcelFile(self.path) rs = pd.read_excel(reader, 'test1', index_col=0) - xp = self.frame2.copy() + xp = frame2.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) - def test_roundtrip_indexlabels(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan + def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): + frame = frame.copy() + frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + 
frame.to_excel(self.path, 'test1', index=False) # test index_label - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, 'test1', - index_label=['test'], - merge_cells=merge_cells) + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, 'test1', + index_label=['test'], + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=merge_cells) + df.index.names = ['test'] + assert df.index.names == recons.index.names + + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, + 'test1', + index_label=['test', 'dummy', 'dummy2'], + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label='test', - merge_cells=merge_cells) + df.index.names = ['test'] + assert df.index.names == recons.index.names + + df = (DataFrame(np.random.randn(10, 2)) >= 0) + df.to_excel(self.path, + 'test1', + index_label='test', + merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel( reader, 'test1', index_col=0).astype(np.int64) - frame.index.names = ['test'] - tm.assert_frame_equal(frame, recons.astype(bool)) + df.index.names = ['test'] + tm.assert_frame_equal(df, recons.astype(bool)) - self.frame.to_excel(self.path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=merge_cells) + frame.to_excel(self.path, + 'test1', + columns=['A', 'B', 'C', 'D'], + index=False, merge_cells=merge_cells) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') - df = self.frame.copy() + df = frame.copy() df = df.set_index(['A', 'B']) reader = ExcelFile(self.path) @@ -1395,17 +1402,17 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): tm.assert_frame_equal(result, df) assert result.index.name == 'foo' - def test_excel_roundtrip_datetime(self, merge_cells, *_): + def test_excel_roundtrip_datetime(self, merge_cells, tsframe, *_): # datetime.date, not sure what to test here exactly - tsf = self.tsframe.copy() + tsf = tsframe.copy() - tsf.index = [x.date() for x in self.tsframe.index] + tsf.index = [x.date() for x in tsframe.index] tsf.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(self.tsframe, recons) + tm.assert_frame_equal(tsframe, recons) def test_excel_date_datetime_format(self, merge_cells, engine, ext): # see gh-4133 @@ -1450,14 +1457,14 @@ def test_to_excel_interval_no_labels(self, *_): # see gh-19242 # # Test writing Interval without labels. 
- frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = df.copy() - frame["new"] = pd.cut(frame[0], 10) + df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) @@ -1467,15 +1474,15 @@ def test_to_excel_interval_labels(self, *_): # see gh-19242 # # Test writing Interval with labels. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", - "F", "G", "H", "I", "J"]) - frame["new"] = intervals + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = df.copy() + intervals = pd.cut(df[0], 10, labels=["A", "B", "C", "D", "E", + "F", "G", "H", "I", "J"]) + df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) @@ -1485,23 +1492,23 @@ def test_to_excel_timedelta(self, *_): # see gh-19242, gh-9155 # # Test writing timedelta to xls. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=["A"], dtype=np.int64) - expected = frame.copy() + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + columns=["A"], dtype=np.int64) + expected = df.copy() - frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) + df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - frame.to_excel(self.path, "test1") + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, merge_cells, engine, ext): - frame = self.tsframe - xp = frame.resample('M', kind='period').mean() + def test_to_excel_periodindex( + self, merge_cells, engine, ext, tsframe): + xp = tsframe.resample('M', kind='period').mean() xp.to_excel(self.path, 'sht1') @@ -1509,8 +1516,7 @@ def test_to_excel_periodindex(self, merge_cells, engine, ext): rs = pd.read_excel(reader, 'sht1', index_col=0) tm.assert_frame_equal(xp, rs.to_period('M')) - def test_to_excel_multiindex(self, merge_cells, engine, ext): - frame = self.frame + def test_to_excel_multiindex(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) @@ -1526,21 +1532,21 @@ def test_to_excel_multiindex(self, merge_cells, engine, ext): tm.assert_frame_equal(frame, df) # GH13511 - def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): - frame = pd.DataFrame({'A': [None, 2, 3], - 'B': [10, 20, 30], - 'C': np.random.sample(3)}) - frame = frame.set_index(['A', 'B']) - - frame.to_excel(self.path, merge_cells=merge_cells) - df = pd.read_excel(self.path, index_col=[0, 1]) - tm.assert_frame_equal(frame, df) + def test_to_excel_multiindex_nan_label( + self, merge_cells, engine, ext): + df = pd.DataFrame({'A': [None, 2, 3], + 'B': [10, 20, 30], + 'C': np.random.sample(3)}) + df = df.set_index(['A', 'B']) + + df.to_excel(self.path, merge_cells=merge_cells) + df1 = pd.read_excel(self.path, 
index_col=[0, 1]) + tm.assert_frame_equal(df, df1) # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): - frame = self.frame + def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) @@ -1563,9 +1569,9 @@ def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): + def test_to_excel_multiindex_dates( + self, merge_cells, engine, ext, tsframe): # try multiindex with dates - tsframe = self.tsframe.copy() new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) From 047d32d20640898978dbf6d9855cd6fecbbcf0d5 Mon Sep 17 00:00:00 2001 From: DanielFEvans <41120183+DanielFEvans@users.noreply.github.com> Date: Wed, 5 Jun 2019 19:44:38 +0100 Subject: [PATCH 47/51] ERR: include original error message for missing required dependencies (#26665) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 8 +++++--- pandas/tests/test_base.py | 27 +++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4018418294963..8fd9f07442810 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,7 +82,7 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- +- Error message for missing required imports now includes the original ImportError's text (:issue:`23868`) .. 
_whatsnew_0250.api_breaking: diff --git a/pandas/__init__.py b/pandas/__init__.py index 4c494b4a62e39..11ea3047bb62a 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,11 +10,13 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append(dependency) + missing_dependencies.append((dependency, e)) if missing_dependencies: - raise ImportError( - "Missing required dependencies {0}".format(missing_dependencies)) + msg = "Unable to import required dependencies:" + for dependency, e in missing_dependencies: + msg += "\n{0}: {1}".format(dependency, str(e)) + raise ImportError(msg) del hard_dependencies, dependency, missing_dependencies # numpy compat diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 3b4f85e680f6e..f8319999682e8 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,7 +1,9 @@ from datetime import datetime, timedelta +from importlib import reload from io import StringIO import re import sys +from unittest.mock import patch import numpy as np import pytest @@ -1341,3 +1343,28 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) + + +@patch("builtins.__import__") +def test_missing_required_dependency(mock_import): + def mock_import_fail(name, *args, **kwargs): + if name == "numpy": + raise ImportError("cannot import name numpy") + elif name == "pytz": + raise ImportError("cannot import name some_dependency") + elif name == "dateutil": + raise ImportError("cannot import name some_other_dependency") + else: + return __import__(name, *args, **kwargs) + + mock_import.side_effect = mock_import_fail + + expected_msg = ( + "Unable to import required dependencies:" + "\nnumpy: cannot import name numpy" + "\npytz: cannot import name some_dependency" + "\ndateutil: cannot import name some_other_dependency" + ) + + with pytest.raises(ImportError, match=expected_msg): + reload(pd) From 1d7ad5fd7577f3da1c8eb19cf547f62d392405d0 Mon Sep 17 00:00:00 2001 From: nathalier Date: Wed, 5 Jun 2019 20:06:13 +0100 Subject: [PATCH 48/51] BUG: fix TypeError for invalid integer dates %Y%m%d with errors='ignore' (# GH 26583) (#26585) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/tslibs/strptime.pyx | 6 +++--- pandas/tests/indexes/datetimes/test_tools.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8fd9f07442810..02ee275bab364 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -537,6 +537,7 @@ Datetimelike - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`) - Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``error='coerce'`` (:issue:`25512`) +- Bug in :func:`to_datetime` which raises ``TypeError`` for ``format='%Y%m%d'`` when called for invalid integer dates with length >= 6 digits with ``errors='ignore'`` Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index af3d3fa646a12..d93858cff5e05 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -140,13 +140,13 @@ def 
array_strptime(object[:] values, object fmt, iresult[i] = NPY_NAT continue raise ValueError("time data %r does not match " - "format %r (match)" % (values[i], fmt)) + "format %r (match)" % (val, fmt)) if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT continue raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) + val[found.end():]) # search else: @@ -156,7 +156,7 @@ def array_strptime(object[:] values, object fmt, iresult[i] = NPY_NAT continue raise ValueError("time data %r does not match format " - "%r (search)" % (values[i], fmt)) + "%r (search)" % (val, fmt)) iso_year = -1 year = 1900 diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index c507c31ee54dd..ea33e563b31be 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -133,6 +133,25 @@ def test_to_datetime_format_integer(self, cache): result = to_datetime(s, format='%Y%m', cache=cache) assert_series_equal(result, expected) + @pytest.mark.parametrize('int_date, expected', [ + # valid date, length == 8 + [20121030, datetime(2012, 10, 30)], + # short valid date, length == 6 + [199934, datetime(1999, 3, 4)], + # long integer date partially parsed to datetime(2012,1,1), length > 8 + [2012010101, 2012010101], + # invalid date partially parsed to datetime(2012,9,9), length == 8 + [20129930, 20129930], + # short integer date partially parsed to datetime(2012,9,9), length < 8 + [2012993, 2012993], + # short invalid date, length == 4 + [2121, 2121]]) + def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, + expected): + # GH 26583 + result = to_datetime(int_date, format='%Y%m%d', errors='ignore') + assert result == expected + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_format_microsecond(self, cache): From 30d9cf30c680596fc6e00b3e06a30d2fc62bad69 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Jun 2019 22:30:45 +0200 Subject: [PATCH 49/51] Revert "ERR: include original error message for missing required dependencies (#26665)" This reverts commit 047d32d20640898978dbf6d9855cd6fecbbcf0d5. --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/__init__.py | 8 +++----- pandas/tests/test_base.py | 27 --------------------------- 3 files changed, 4 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 02ee275bab364..1fb9b5ae695a0 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -82,7 +82,7 @@ Other Enhancements - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) - :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) -- Error message for missing required imports now includes the original ImportError's text (:issue:`23868`) +- .. 
_whatsnew_0250.api_breaking: diff --git a/pandas/__init__.py b/pandas/__init__.py index 11ea3047bb62a..4c494b4a62e39 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,13 +10,11 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append((dependency, e)) + missing_dependencies.append(dependency) if missing_dependencies: - msg = "Unable to import required dependencies:" - for dependency, e in missing_dependencies: - msg += "\n{0}: {1}".format(dependency, str(e)) - raise ImportError(msg) + raise ImportError( + "Missing required dependencies {0}".format(missing_dependencies)) del hard_dependencies, dependency, missing_dependencies # numpy compat diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index f8319999682e8..3b4f85e680f6e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,9 +1,7 @@ from datetime import datetime, timedelta -from importlib import reload from io import StringIO import re import sys -from unittest.mock import patch import numpy as np import pytest @@ -1343,28 +1341,3 @@ def test_to_numpy_dtype(as_series): expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) - - -@patch("builtins.__import__") -def test_missing_required_dependency(mock_import): - def mock_import_fail(name, *args, **kwargs): - if name == "numpy": - raise ImportError("cannot import name numpy") - elif name == "pytz": - raise ImportError("cannot import name some_dependency") - elif name == "dateutil": - raise ImportError("cannot import name some_other_dependency") - else: - return __import__(name, *args, **kwargs) - - mock_import.side_effect = mock_import_fail - - expected_msg = ( - "Unable to import required dependencies:" - "\nnumpy: cannot import name numpy" - "\npytz: cannot import name some_dependency" - "\ndateutil: cannot import name some_other_dependency" - ) - - with pytest.raises(ImportError, match=expected_msg): - reload(pd) From f8b4c57ad1e4f1a105905c53ffcf40a5dc5080c3 Mon Sep 17 00:00:00 2001 From: AlexTereshenkov <50622389+AlexTereshenkov@users.noreply.github.com> Date: Wed, 5 Jun 2019 22:37:54 +0100 Subject: [PATCH 50/51] Remove redundant check arr_or_dtype is None (#26655) --- pandas/core/dtypes/common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4029e6f4bfdb5..52011d53d22cd 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1931,8 +1931,6 @@ def _is_dtype_type(arr_or_dtype, condition): if issubclass(arr_or_dtype, ExtensionDtype): arr_or_dtype = arr_or_dtype.type return condition(np.dtype(arr_or_dtype).type) - elif arr_or_dtype is None: - return condition(type(None)) # if we have an array-like if hasattr(arr_or_dtype, 'dtype'): From 891a419a5155e6b42c0696a81cf853b6f3febbf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Jun 2019 16:48:47 -0500 Subject: [PATCH 51/51] filter warning in repr (#26669) --- pandas/core/sparse/frame.py | 5 +++++ pandas/core/sparse/series.py | 10 ++++++---- pandas/tests/sparse/test_format.py | 13 +++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 0320da6d9a48d..67ecbcbea67f9 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -242,6 +242,11 @@ def _init_spmatrix(self, data, index, columns, dtype=None, def to_coo(self): return SparseFrameAccessor(self).to_coo() + def __repr__(self): + with 
warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Sparse") + return super().__repr__() + def __getstate__(self): # pickling return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 3814d8bb66635..3e3bae6444082 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -214,10 +214,12 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): fill_value=fill_value, kind=kind, copy=copy) def __repr__(self): - series_rep = Series.__repr__(self) - rep = '{series}\n{index!r}'.format(series=series_rep, - index=self.sp_index) - return rep + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Sparse") + series_rep = Series.__repr__(self) + rep = '{series}\n{index!r}'.format(series=series_rep, + index=self.sp_index) + return rep def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): diff --git a/pandas/tests/sparse/test_format.py b/pandas/tests/sparse/test_format.py index 37c2acc587cf6..7ed8c48fce333 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest @@ -133,3 +135,14 @@ def test_sparse_repr_after_set(self): repr(sdf) tm.assert_sp_frame_equal(sdf, res) + + +def test_repr_no_warning(): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + df = pd.SparseDataFrame({"A": [1, 2]}) + s = df['A'] + + with tm.assert_produces_warning(None): + repr(df) + repr(s)
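Taken together, patch 51 means that printing a sparse structure no longer emits the "Sparse" deprecation warnings its repr would otherwise trigger. A quick way to confirm that behaviour, assuming a build that includes this patch (this mirrors the new ``test_repr_no_warning`` test added above):

    import warnings
    import pandas as pd

    with warnings.catch_warnings():
        # Constructing a SparseDataFrame is itself deprecated, so silence
        # the FutureWarning here, exactly as the new test does.
        warnings.simplefilter("ignore", FutureWarning)
        df = pd.SparseDataFrame({"A": [1, 2]})
        s = df["A"]

    with warnings.catch_warnings():
        warnings.simplefilter("error")  # turn any warning into an exception
        repr(df)  # silent: __repr__ now filters the "Sparse" warnings internally
        repr(s)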