From 3e5879440a6b513233f71f62de9234f67125dee7 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Thu, 14 Jun 2018 13:09:44 -0400 Subject: [PATCH] Handle colspan and rowspan This is essentially a rebased and squashed #17054 (mad props to @jowens for doing all the hard thinking). My tweaks: * test_computer_sales_page (see #17074) no longer tests for ParserError, because the ParserError was a bug caused by missing colspan support. Now, test that MultiIndex works as expected. * I respectfully removed the fill_rowspan argument from #17073. Instead, the virtual cells created by rowspan/colspan are always copies of the real cells' text. This prevents _infer_columns() from naming virtual cells as "Unnamed: ..." * I removed a small layer of abstraction to respect #20891 (multiple <tbody> support), which was implemented after @jowens' pull request. Now _HtmlFrameParser has _parse_thead_trs, _parse_tbody_trs and _parse_tfoot_trs, each returning a list of <tr>s. That let me remove _parse_tr, Making All The Tests Pass. * That caused a snowball effect. lxml does not fix malformed <thead>, as tested by spam.html. The previous hacky workaround was in _parse_raw_thead, but the new _parse_thead_trs signature returns nodes instead of text. The new hacky solution: return the <thead> itself, pretending it's a <tr>. This works in all the tests. A better solution is to use html5lib with lxml; but that might belong in a separate pull request. --- doc/source/whatsnew/v0.24.0.txt | 3 +- pandas/io/html.py | 368 ++++++++++++++++++-------------- pandas/tests/io/test_html.py | 210 ++++++++++++++++-- 3 files changed, 405 insertions(+), 176 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index abf574ae109fd..0f0ad3452e934 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -24,6 +24,7 @@ Other Enhancements `__. 
(:issue:`21627`) - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`) - .. _whatsnew_0240.api_breaking: @@ -223,7 +224,7 @@ MultiIndex I/O ^^^ -- +- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - - diff --git a/pandas/io/html.py b/pandas/io/html.py index 8fd876e85889f..6e774f1846b99 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -10,13 +10,11 @@ from distutils.version import LooseVersion -import numpy as np - from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError from pandas.io.common import _is_url, urlopen, _validate_header_arg from pandas.io.parsers import TextParser -from pandas.compat import (lrange, lmap, u, string_types, iteritems, +from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems, raise_with_traceback, binary_type) from pandas import Series import pandas.core.common as com @@ -193,11 +191,11 @@ class _HtmlFrameParser(object): * :func:`_build_doc` * :func:`_text_getter` * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` * :func:`_parse_tables` - * :func:`_parse_tr` - * :func:`_parse_thead` - * :func:`_parse_tbody` - * :func:`_parse_tfoot` + * :func:`_equals_tag` See each method's respective documentation for details on their functionality. 
""" @@ -210,32 +208,14 @@ def __init__(self, io, match, attrs, encoding, displayed_only): self.displayed_only = displayed_only def parse_tables(self): - tables = self._parse_tables(self._build_doc(), self.match, self.attrs) - return (self._build_table(table) for table in tables) - - def _parse_raw_data(self, rows): - """Parse the raw data into a list of lists. - - Parameters - ---------- - rows : iterable of node-like - A list of row elements. - - text_getter : callable - A callable that gets the text from an individual node. This must be - defined by subclasses. - - column_finder : callable - A callable that takes a row node as input and returns a list of the - column node in that row. This must be defined by subclasses. + """Parse and return all tables from the DOM. Returns ------- - data : list of list of strings + tables : list of parsed (header, body, footer) tuples from tables """ - data = [[_remove_whitespace(self._text_getter(col)) for col in - self._parse_td(row)] for row in rows] - return data + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) def _text_getter(self, obj): """Return the text of an individual DOM node. @@ -257,7 +237,7 @@ def _parse_td(self, obj): Parameters ---------- - obj : node-like + obj : an HTML row element Returns ------- @@ -266,90 +246,88 @@ def _parse_td(self, obj): """ raise com.AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): - """Return all tables from the parsed DOM. + def _parse_thead_tr(self, table): + """Return the list of thead row elements from the parsed table element. Parameters ---------- - doc : tree-like - The DOM from which to parse the table element. - - match : str or regular expression - The text to search for in the DOM tree. - - attrs : dict - A dictionary of table attributes that can be used to disambiguate - multiple tables on a page. 
- - Raises - ------ - ValueError - * If `match` does not match any text in the document. + table : a table element that contains zero or more thead elements. Returns ------- - tables : list of node-like - A list of elements to be parsed into raw data. + rows : list of row elements of a table """ raise com.AbstractMethodError(self) - def _parse_tr(self, table): - """Return the list of row elements from the parsed table element. + def _parse_tbody_tr(self, table): + """Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - rows : list of node-like - A list row elements of a table, usually or row elements of a table """ raise com.AbstractMethodError(self) - def _parse_thead(self, table): - """Return the header of a table. + def _parse_tfoot_tr(self, table): + """Return the list of tfoot row elements from the parsed table element. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - thead : node-like - A ... element. + rows : list of row elements of a table """ raise com.AbstractMethodError(self) - def _parse_tbody(self, table): - """Return the list of tbody elements from the parsed table element. + def _parse_tables(self, doc, match, attrs): + """Return all tables from the parsed DOM. Parameters ---------- - table : node-like - A table element that contains row elements. + doc : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. 
+ + Raises + ------ + ValueError : `match` does not match any text in the document. Returns ------- - tbodys : list of node-like - A list of ... elements + tables : list of HTML
elements. + rows : list of
elements to be parsed into raw data. """ raise com.AbstractMethodError(self) - def _parse_tfoot(self, table): - """Return the footer of the table if any. + def _equals_tag(self, obj, tag): + """Return whether an individual DOM node matches a tag Parameters ---------- - table : node-like - A table element that contains row elements. + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality Returns ------- - tfoot : node-like - A ... element. + is_tag_equal : boolean + boolean indicating if the object is equal to tag 'tag' """ raise com.AbstractMethodError(self) @@ -358,47 +336,115 @@ def _build_doc(self): Returns ------- - obj : tree-like + obj : the DOM from which to parse the table element. """ raise com.AbstractMethodError(self) - def _build_table(self, table): - header = self._parse_raw_thead(table) - body = self._parse_raw_tbody(table) - footer = self._parse_raw_tfoot(table) - return header, body, footer + def _parse_thead_tbody_tfoot(self, table_html): + """Given a table, return parsed header, body, and foot. + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of parsed elements. - def _parse_raw_thead(self, table): - thead = self._parse_thead(table) - res = [] - if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = lmap(self._text_getter, self._parse_td(tr)) - if any(col != '' for col in cols): - res.append(cols) - return res + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Treat first all-. Treat last all-s, return a list of text rows that copy cell + text across rowspans/colspans. 
- return self._parse_raw_data(raw_data) + Parameters + ---------- + rows : list of s + + Returns + ------- + res : list of rows, each of which is a list of str in that row + """ + + res = [] + saved_span = [] + for row in rows: + extracted_row = self._parse_td(row) + cols_text = [_remove_whitespace( + self._text_getter(col)) for col in extracted_row] + col_colspans = [int(col.get('colspan', 1)) + for col in extracted_row] + col_rowspans = [int(col.get('rowspan', 1)) + for col in extracted_row] + # expand cols using col_colspans + # maybe this can be done with a list comprehension, dunno + cols = list(zip( + list(com.flatten( + lmap(lambda text_nc: [text_nc[0]] * text_nc[1], + list(zip(cols_text, col_colspans))))), + list(com.flatten( + lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0], + list(zip(col_colspans, col_rowspans)))))) + ) + # cols is now a list of (text, number of rows) + # now insert any previous rowspans + for (col, (text, nr)) in saved_span: + cols.insert(col, (text, nr)) + + # save next saved_span + def advance_item_to_next_row(item): + (col, (text, nr)) = item + if nr == 1: + return None + else: + return (col, (text, nr - 1)) + saved_span = lfilter(lambda i: i is not None, + lmap(advance_item_to_next_row, + list(enumerate(cols)))) + cols = [text for (text, nr) in cols] + # generate cols with text only + if any([col != '' for col in cols]): + res.append(cols) + + return res def _handle_hidden_tables(self, tbl_list, attr_name): """Returns list of tables, potentially removing hidden elements @@ -442,27 +488,6 @@ def __init__(self, *args, **kwargs): from bs4 import SoupStrainer self._strainer = SoupStrainer('table') - def _text_getter(self, obj): - return obj.text - - def _parse_td(self, row): - return row.find_all(('td', 'th')) - - def _parse_tr(self, element): - return element.find_all('tr') - - def _parse_th(self, element): - return element.find_all('th') - - def _parse_thead(self, table): - return table.find_all('thead') - - def _parse_tbody(self, table): - 
return table.find_all('tbody') - - def _parse_tfoot(self, table): - return table.find_all('tfoot') - def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) @@ -490,6 +515,27 @@ def _parse_tables(self, doc, match, attrs): .format(patt=match.pattern)) return result + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag): + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(('td', 'th'), recursive=False) + + def _parse_thead_tr(self, table): + return table.select('thead tr') + + def _parse_tbody_tr(self, table): + from_tbody = table.select('tbody tr') + from_root = table.find_all('tr', recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select('tfoot tr') + def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: @@ -554,10 +600,9 @@ def _text_getter(self, obj): return obj.text_content() def _parse_td(self, row): - return row.xpath('.//td|.//th') - - def _parse_tr(self, table): - return table.xpath('.//tr') + # Look for direct descendents only: the "row" element here may be a + # or (see _parse_thead_tr). 
+ return row.xpath('./td|./th') def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -590,6 +635,12 @@ def _parse_tables(self, doc, match, kwargs): .format(patt=pattern)) return tables + def _equals_tag(self, obj, tag): + return obj.tag == tag + + def _contains_tag(self, obj, tag): + return obj.find(tag) is not None + def _build_doc(self): """ Raises @@ -637,41 +688,30 @@ def _build_doc(self): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r - def _parse_tbody(self, table): - return table.xpath('.//tbody') + def _parse_thead_tr(self, table): + rows = [] - def _parse_thead(self, table): - return table.xpath('.//thead') + for thead in table.xpath('.//thead'): + rows.extend(thead.xpath('./tr')) - def _parse_tfoot(self, table): - return table.xpath('.//tfoot') + # lxml does not clean up the clearly-erroneous + # . + elements_at_root = thead.xpath('./td|./th') + if elements_at_root: + # Pass the entire as a row. _parse_td() will interpret + # it correctly. 
+ rows.append(thead) - def _parse_raw_thead(self, table): - expr = './/thead' - thead = table.xpath(expr) - res = [] - if thead: - # Grab any directly descending table headers first - ths = thead[0].xpath('./th') - if ths: - cols = [_remove_whitespace(x.text_content()) for x in ths] - if any(col != '' for col in cols): - res.append(cols) - else: - trs = self._parse_tr(thead[0]) + return rows - for tr in trs: - cols = [_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + def _parse_tbody_tr(self, table): + from_tbody = table.xpath('.//tbody//tr') + from_root = table.xpath('./tr') + # HTML spec: at most one of these lists has content + return from_tbody + from_root - if any(col != '' for col in cols): - res.append(cols) - return res - - def _parse_raw_tfoot(self, table): - expr = './/tfoot//th|//tfoot//td' - return [_remove_whitespace(x.text_content()) for x in - table.xpath(expr)] + def _parse_tfoot_tr(self, table): + return table.xpath('.//tfoot//tr') def _expand_elements(body): @@ -695,7 +735,7 @@ def _data_to_frame(**kwargs): header = 0 if rows == [0] else rows if foot: - body += [foot] + body += foot # fill out elements of body that are "ragged" _expand_elements(body) @@ -953,7 +993,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This function searches for ``
+ - Move rows from bottom of body to footer only if + all elements inside row are - def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) - res = [] - if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res + Parameters + ---------- + table_html : a single HTML table element. - def _parse_raw_tbody(self, table): - tbodies = self._parse_tbody(table) + Returns + ------- + tuple of (header, body, footer) + header : list of rows, each of which is a list of parsed + header elements + body : list of rows, each of which is a list of parsed body elements + footer : list of rows, each of which is a list of parsed + footer elements + """ - raw_data = [] + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) - if tbodies: - for tbody in tbodies: - raw_data.extend(self._parse_tr(tbody)) - else: - raw_data.extend(self._parse_tr(table)) + if not header_rows: + # The table has no
rows as headers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[0])): + # this row should be a header row, move it from body to header + header_rows.append(body_rows.pop(0)) + + if not footer_rows: + # The table has no
rows as footers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[-1])): + # this row should be a footer row, move it from body to footer + footer_rows.insert(0, body_rows.pop()) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + + return header, body, footer + + def _expand_colspan_rowspan(self, rows): + """Given a list of
foobar
`` elements and only for ```` and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only `` within on malformed HTML. + """ + data1 = StringIO('''
`` rows and ```` elements within each ``
`` - element in the table. ```` stands for "table data". + element in the table. ```` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``
`` elements into the header). + + .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied **after** `skiprows` is applied. diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c6a8de7ed446..b8f520ee17d72 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -18,7 +18,6 @@ from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -129,16 +128,7 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam_no_types(self): - - # infer_types removed in #10892 - df1 = self.read_html(self.spam_data, '.*Water.*') - df2 = self.read_html(self.spam_data, 'Unit') - assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' - - def test_spam_with_types(self): + def test_spam(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) @@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -461,6 +451,44 @@ def test_header_and_one_column(self): result = self.read_html(data)[0] tm.assert_frame_equal(result, expected) + def test_thead_without_tr(self): + """ + Ensure parser adds
+ + + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
''') + data2 = StringIO(''' + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
''') + res1 = self.read_html(data1) + res2 = self.read_html(data2, header=0) + assert_framelist_equal(res1, res2) + def test_tfoot_read(self): """ Make sure that read_html reads tfoot, containing td or th. @@ -592,7 +620,7 @@ def test_gold_canyon(self): attrs={'id': 'table'})[0] assert gc in df.to_string() - def test_different_number_of_rows(self): + def test_different_number_of_cols(self): expected = """ @@ -654,6 +682,160 @@ def test_different_number_of_rows(self): res = self.read_html(out, index_col=0)[0] tm.assert_frame_equal(expected, res) + def test_colspan_rowspan_are_1(self): + # GH17054 + expected = """
+ + + + + + + + + + +
XYZW
""" + out = """ + + + + + + + + + + +
XYZW
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_colspan_rowspan_are_more_than_1(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + + +
XXYZW
122Z3
""" + out = """ + + + + + + + + + + + + + + + +
XYZW
123
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_tbody_colspan_rowspan_copy_values(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + +
11234
56637
""" + out = """ + + + + + + + + + + + + + +
1234
567
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_header_should_be_inferred_from_th_elements(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + +
XXYZW
12345
""" + out = """ + + + + + + + + + + + + + +
XXYZW
12345
""" + expected = self.read_html(expected)[0] # header is explicit + res = self.read_html(out)[0] # infer header + tm.assert_frame_equal(expected, res) + res2 = self.read_html(out, header=0)[0] # manually set header + tm.assert_frame_equal(expected, res2) + def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) expected = df.to_html()