From 3e5879440a6b513233f71f62de9234f67125dee7 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Thu, 14 Jun 2018 13:09:44 -0400
Subject: [PATCH] Handle colspan and rowspan

This is essentially a rebased and squashed #17054 (mad props to @jowens
for doing all the hard thinking). My tweaks:

* test_computer_sales_page (see #17074) no longer tests for ParserError,
  because the ParserError was a bug caused by missing colspan support.
  Now, test that MultiIndex works as expected.
* I respectfully removed the fill_rowspan argument from #17073. Instead,
  the virtual cells created by rowspan/colspan are always copies of the
  real cells' text. This prevents _infer_columns() from naming virtual
  cells as "Unnamed: ..."
* I removed a small layer of abstraction to respect #20891 (multiple
  <tbody> support), which was implemented after @jowens' pull request.
  Now _HtmlFrameParser has _parse_thead_tr, _parse_tbody_tr and
  _parse_tfoot_tr, each returning a list of <tr>s. That let me remove
  _parse_tr, Making All The Tests Pass.
* That caused a snowball effect. lxml does not fix malformed <thead>, as
  tested by spam.html. The previous hacky workaround was in
  _parse_raw_thead, but the new _parse_thead_tr signature returns nodes
  instead of text. The new hacky solution: return the <thead> itself,
  pretending it's a <tr>. This works in all the tests. A better solution
  is to use html5lib with lxml; but that might belong in a separate pull
  request.
---
 doc/source/whatsnew/v0.24.0.txt |   3 +-
 pandas/io/html.py               | 368 ++++++++++++++++++--------------
 pandas/tests/io/test_html.py    | 210 ++++++++++++++++--
 3 files changed, 405 insertions(+), 176 deletions(-)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index abf574ae109fd..0f0ad3452e934 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -24,6 +24,7 @@ Other Enhancements
   <https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
   (:issue:`21627`)
 - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
+- :func:`read_html` now handles ``colspan`` and ``rowspan`` attributes and attempts to infer a header if the header is not explicitly specified (:issue:`17054`)
 -
 
 .. _whatsnew_0240.api_breaking:
@@ -223,7 +224,7 @@ MultiIndex
 I/O
 ^^^
 
--
+- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 -
 -
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 8fd876e85889f..6e774f1846b99 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -10,13 +10,11 @@
 
 from distutils.version import LooseVersion
 
-import numpy as np
-
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
 from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
-from pandas.compat import (lrange, lmap, u, string_types, iteritems,
+from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
 from pandas import Series
 import pandas.core.common as com
@@ -193,11 +191,11 @@ class _HtmlFrameParser(object):
         * :func:`_build_doc`
         * :func:`_text_getter`
         * :func:`_parse_td`
+        * :func:`_parse_thead_tr`
+        * :func:`_parse_tbody_tr`
+        * :func:`_parse_tfoot_tr`
         * :func:`_parse_tables`
-        * :func:`_parse_tr`
-        * :func:`_parse_thead`
-        * :func:`_parse_tbody`
-        * :func:`_parse_tfoot`
+        * :func:`_equals_tag`
     See each method's respective documentation for details on their
     functionality.
     """
@@ -210,32 +208,14 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
         self.displayed_only = displayed_only
 
     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
-        return (self._build_table(table) for table in tables)
-
-    def _parse_raw_data(self, rows):
-        """Parse the raw data into a list of lists.
-
-        Parameters
-        ----------
-        rows : iterable of node-like
-            A list of row elements.
-
-        text_getter : callable
-            A callable that gets the text from an individual node. This must be
-            defined by subclasses.
-
-        column_finder : callable
-            A callable that takes a row node as input and returns a list of the
-            column node in that row. This must be defined by subclasses.
+        """Parse and return all tables from the DOM.
 
         Returns
         -------
-        data : list of list of strings
+        tables : list of parsed (header, body, footer) tuples from tables
         """
-        data = [[_remove_whitespace(self._text_getter(col)) for col in
-                 self._parse_td(row)] for row in rows]
-        return data
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        return (self._parse_thead_tbody_tfoot(table) for table in tables)
 
     def _text_getter(self, obj):
         """Return the text of an individual DOM node.
@@ -257,7 +237,7 @@ def _parse_td(self, obj):
 
         Parameters
         ----------
-        obj : node-like
+        obj : an HTML row element
 
         Returns
         -------
@@ -266,90 +246,88 @@ def _parse_td(self, obj):
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tables(self, doc, match, attrs):
-        """Return all tables from the parsed DOM.
+    def _parse_thead_tr(self, table):
+        """Return the list of thead row elements from the parsed table element.
 
         Parameters
         ----------
-        doc : tree-like
-            The DOM from which to parse the table element.
-
-        match : str or regular expression
-            The text to search for in the DOM tree.
-
-        attrs : dict
-            A dictionary of table attributes that can be used to disambiguate
-            multiple tables on a page.
-
-        Raises
-        ------
-        ValueError
-            * If `match` does not match any text in the document.
+        table : a table element that contains zero or more thead elements.
 
         Returns
         -------
-        tables : list of node-like
-            A list of <table> elements to be parsed into raw data.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tr(self, table):
-        """Return the list of row elements from the parsed table element.
+    def _parse_tbody_tr(self, table):
+        """Return the list of tbody row elements from the parsed table element.
+
+        HTML5 table bodies consist of either 0 or more <tbody> elements (which
+        only contain <tr> elements) or 0 or more <tr> elements. This method
+        checks for both structures.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        rows : list of node-like
-            A list row elements of a table, usually <tr> or <th> elements.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_thead(self, table):
-        """Return the header of a table.
+    def _parse_tfoot_tr(self, table):
+        """Return the list of tfoot row elements from the parsed table element.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        thead : node-like
-            A <thead>...</thead> element.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tbody(self, table):
-        """Return the list of tbody elements from the parsed table element.
+    def _parse_tables(self, doc, match, attrs):
+        """Return all tables from the parsed DOM.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        doc : the DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError : `match` does not match any text in the document.
 
         Returns
         -------
-        tbodys : list of node-like
-            A list of <tbody>...</tbody> elements
+        tables : list of HTML <table> elements to be parsed into raw data.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tfoot(self, table):
-        """Return the footer of the table if any.
+    def _equals_tag(self, obj, tag):
+        """Return whether an individual DOM node matches a tag
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        obj : node-like
+            A DOM node.
+
+        tag : str
+            Tag name to be checked for equality
 
         Returns
         -------
-        tfoot : node-like
-            A <tfoot>...</tfoot> element.
+        is_tag_equal : boolean
+            boolean indicating if the object is equal to tag 'tag'
         """
         raise com.AbstractMethodError(self)
 
@@ -358,47 +336,115 @@ def _build_doc(self):
 
         Returns
         -------
-        obj : tree-like
+        obj : the DOM from which to parse the table element.
         """
         raise com.AbstractMethodError(self)
 
-    def _build_table(self, table):
-        header = self._parse_raw_thead(table)
-        body = self._parse_raw_tbody(table)
-        footer = self._parse_raw_tfoot(table)
-        return header, body, footer
+    def _parse_thead_tbody_tfoot(self, table_html):
+        """Given a table, return parsed header, body, and foot.
+           Header, body, and footer are lists-of-lists. Top level list is
+           a list of rows. Each row is a list of parsed elements.
 
-    def _parse_raw_thead(self, table):
-        thead = self._parse_thead(table)
-        res = []
-        if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = lmap(self._text_getter, self._parse_td(tr))
-                if any(col != '' for col in cols):
-                    res.append(cols)
-        return res
+           Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+                  header, body, and footer, otherwise:
+                  - Put all rows into body
+                  - Move rows from top of body to header only if
+                    all elements inside row are <th>
+                  - Move rows from bottom of body to footer only if
+                    all elements inside row are <th>
 
-    def _parse_raw_tfoot(self, table):
-        tfoot = self._parse_tfoot(table)
-        res = []
-        if tfoot:
-            res = lmap(self._text_getter, self._parse_td(tfoot[0]))
-        return np.atleast_1d(
-            np.array(res).squeeze()) if res and len(res) == 1 else res
+        Parameters
+        ----------
+        table_html : a single HTML table element.
 
-    def _parse_raw_tbody(self, table):
-        tbodies = self._parse_tbody(table)
+        Returns
+        -------
+        tuple of (header, body, footer)
+        header : list of rows, each of which is a list of parsed
+                 header elements
+        body : list of rows, each of which is a list of parsed body elements
+        footer : list of rows, each of which is a list of parsed
+                 footer elements
+        """
 
-        raw_data = []
+        header_rows = self._parse_thead_tr(table_html)
+        body_rows = self._parse_tbody_tr(table_html)
+        footer_rows = self._parse_tfoot_tr(table_html)
 
-        if tbodies:
-            for tbody in tbodies:
-                raw_data.extend(self._parse_tr(tbody))
-        else:
-            raw_data.extend(self._parse_tr(table))
+        if not header_rows:
+            # The table has no <thead>. Treat first all-<th> rows as headers.
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._parse_td(body_rows[0])):
+                # this row should be a header row, move it from body to header
+                header_rows.append(body_rows.pop(0))
+
+        if not footer_rows:
+            # The table has no <tfoot>. Treat last all-<th> rows as footers.
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._parse_td(body_rows[-1])):
+                # this row should be a footer row, move it from body to footer
+                footer_rows.insert(0, body_rows.pop())
+
+        header = self._expand_colspan_rowspan(header_rows)
+        body = self._expand_colspan_rowspan(body_rows)
+        footer = self._expand_colspan_rowspan(footer_rows)
+
+        return header, body, footer
+
+    def _expand_colspan_rowspan(self, rows):
+        """Given a list of <tr>s, return a list of text rows that copy cell
+           text across rowspans/colspans.
 
-        return self._parse_raw_data(raw_data)
+        Parameters
+        ----------
+        rows : list of <tr>s
+
+        Returns
+        -------
+        res : list of rows, each of which is a list of str in that row
+        """
+
+        res = []
+        saved_span = []
+        for row in rows:
+            extracted_row = self._parse_td(row)
+            cols_text = [_remove_whitespace(
+                self._text_getter(col)) for col in extracted_row]
+            col_colspans = [int(col.get('colspan', 1))
+                            for col in extracted_row]
+            col_rowspans = [int(col.get('rowspan', 1))
+                            for col in extracted_row]
+            # expand cols using col_colspans
+            # maybe this can be done with a list comprehension, dunno
+            cols = list(zip(
+                list(com.flatten(
+                    lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
+                         list(zip(cols_text, col_colspans))))),
+                list(com.flatten(
+                    lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
+                         list(zip(col_colspans, col_rowspans))))))
+            )
+            # cols is now a list of (text, number of rows)
+            # now insert any previous rowspans
+            for (col, (text, nr)) in saved_span:
+                cols.insert(col, (text, nr))
+
+            # save next saved_span
+            def advance_item_to_next_row(item):
+                (col, (text, nr)) = item
+                if nr == 1:
+                    return None
+                else:
+                    return (col, (text, nr - 1))
+            saved_span = lfilter(lambda i: i is not None,
+                                 lmap(advance_item_to_next_row,
+                                      list(enumerate(cols))))
+            cols = [text for (text, nr) in cols]
+            # generate cols with text only
+            if any([col != '' for col in cols]):
+                res.append(cols)
+
+        return res
 
     def _handle_hidden_tables(self, tbl_list, attr_name):
         """Returns list of tables, potentially removing hidden elements
@@ -442,27 +488,6 @@ def __init__(self, *args, **kwargs):
         from bs4 import SoupStrainer
         self._strainer = SoupStrainer('table')
 
-    def _text_getter(self, obj):
-        return obj.text
-
-    def _parse_td(self, row):
-        return row.find_all(('td', 'th'))
-
-    def _parse_tr(self, element):
-        return element.find_all('tr')
-
-    def _parse_th(self, element):
-        return element.find_all('th')
-
-    def _parse_thead(self, table):
-        return table.find_all('thead')
-
-    def _parse_tbody(self, table):
-        return table.find_all('tbody')
-
-    def _parse_tfoot(self, table):
-        return table.find_all('tfoot')
-
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
@@ -490,6 +515,27 @@ def _parse_tables(self, doc, match, attrs):
                              .format(patt=match.pattern))
         return result
 
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _equals_tag(self, obj, tag):
+        return obj.name == tag
+
+    def _parse_td(self, row):
+        return row.find_all(('td', 'th'), recursive=False)
+
+    def _parse_thead_tr(self, table):
+        return table.select('thead tr')
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.select('tbody tr')
+        from_root = table.find_all('tr', recursive=False)
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.select('tfoot tr')
+
     def _setup_build_doc(self):
         raw_text = _read(self.io)
         if not raw_text:
@@ -554,10 +600,9 @@ def _text_getter(self, obj):
         return obj.text_content()
 
     def _parse_td(self, row):
-        return row.xpath('.//td|.//th')
-
-    def _parse_tr(self, table):
-        return table.xpath('.//tr')
+        # Look for direct children only: the "row" element here may be a
+        # <thead> or <tfoot> (see _parse_thead_tr).
+        return row.xpath('./td|./th')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -590,6 +635,12 @@ def _parse_tables(self, doc, match, kwargs):
                              .format(patt=pattern))
         return tables
 
+    def _equals_tag(self, obj, tag):
+        return obj.tag == tag
+
+    def _contains_tag(self, obj, tag):
+        return obj.find(tag) is not None
+
     def _build_doc(self):
         """
         Raises
@@ -637,41 +688,30 @@ def _build_doc(self):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
         return r
 
-    def _parse_tbody(self, table):
-        return table.xpath('.//tbody')
+    def _parse_thead_tr(self, table):
+        rows = []
 
-    def _parse_thead(self, table):
-        return table.xpath('.//thead')
+        for thead in table.xpath('.//thead'):
+            rows.extend(thead.xpath('./tr'))
 
-    def _parse_tfoot(self, table):
-        return table.xpath('.//tfoot')
+            # lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>.
+            elements_at_root = thead.xpath('./td|./th')
+            if elements_at_root:
+                # Pass the entire <thead> as a row. _parse_td() will interpret
+                # it correctly.
+                rows.append(thead)
 
-    def _parse_raw_thead(self, table):
-        expr = './/thead'
-        thead = table.xpath(expr)
-        res = []
-        if thead:
-            # Grab any directly descending table headers first
-            ths = thead[0].xpath('./th')
-            if ths:
-                cols = [_remove_whitespace(x.text_content()) for x in ths]
-                if any(col != '' for col in cols):
-                    res.append(cols)
-            else:
-                trs = self._parse_tr(thead[0])
+        return rows
 
-                for tr in trs:
-                    cols = [_remove_whitespace(x.text_content()) for x in
-                            self._parse_td(tr)]
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.xpath('.//tbody//tr')
+        from_root = table.xpath('./tr')
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
 
-                    if any(col != '' for col in cols):
-                        res.append(cols)
-        return res
-
-    def _parse_raw_tfoot(self, table):
-        expr = './/tfoot//th|//tfoot//td'
-        return [_remove_whitespace(x.text_content()) for x in
-                table.xpath(expr)]
+    def _parse_tfoot_tr(self, table):
+        return table.xpath('.//tfoot//tr')
 
 
 def _expand_elements(body):
@@ -695,7 +735,7 @@ def _data_to_frame(**kwargs):
             header = 0 if rows == [0] else rows
 
     if foot:
-        body += [foot]
+        body += foot
 
     # fill out elements of body that are "ragged"
     _expand_elements(body)
@@ -953,7 +993,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
     This function searches for ``<table>`` elements and only for ``<tr>``
     and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
-    element in the table. ``<td>`` stands for "table data".
+    element in the table. ``<td>`` stands for "table data". This function
+    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+    If the table has a ``<thead>`` element, it is used to construct
+    the header, otherwise the function attempts to find the header within
+    the body (by putting rows with only ``<th>`` elements into the header).
+
+        .. versionadded:: 0.24.0
 
     Similar to :func:`~pandas.read_csv` the `header` argument is applied
     **after** `skiprows` is applied.
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 9c6a8de7ed446..b8f520ee17d72 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -18,7 +18,6 @@
 from pandas.io.common import URLError, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
-from pandas._libs.parsers import ParserError
 
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
@@ -129,16 +128,7 @@ def test_banklist(self):
 
         assert_framelist_equal(df1, df2)
 
-    def test_spam_no_types(self):
-
-        # infer_types removed in #10892
-        df1 = self.read_html(self.spam_data, '.*Water.*')
-        df2 = self.read_html(self.spam_data, 'Unit')
-        assert_framelist_equal(df1, df2)
-        assert df1[0].iloc[0, 0] == 'Proximates'
-        assert df1[0].columns[0] == 'Nutrient'
-
-    def test_spam_with_types(self):
+    def test_spam(self):
         df1 = self.read_html(self.spam_data, '.*Water.*')
         df2 = self.read_html(self.spam_data, 'Unit')
         assert_framelist_equal(df1, df2)
@@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath):
                              attrs={'class': 'style1'})
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isna().any() for _, s in df.iteritems())
+        assert not any(s.isnull().any() for _, s in df.iteritems())
 
     @pytest.mark.slow
     def test_thousands_macau_index_col(self, datapath):
@@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath):
         dfs = self.read_html(macau_data, index_col=0, header=0)
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isna().any() for _, s in df.iteritems())
+        assert not any(s.isnull().any() for _, s in df.iteritems())
 
     def test_empty_tables(self):
         """
@@ -461,6 +451,44 @@ def test_header_and_one_column(self):
         result = self.read_html(data)[0]
         tm.assert_frame_equal(result, expected)
 
+    def test_thead_without_tr(self):
+        """
+        Ensure parser adds <tr> within <thead> on malformed HTML.
+        """
+        data1 = StringIO('''<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        data2 = StringIO('''<table>
+            <thead>
+                <th>Country</th>
+                <th>Municipality</th>
+                <th>Year</th>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        res1 = self.read_html(data1)
+        res2 = self.read_html(data2, header=0)
+        assert_framelist_equal(res1, res2)
+
     def test_tfoot_read(self):
         """
         Make sure that read_html reads tfoot, containing td or th.
@@ -592,7 +620,7 @@ def test_gold_canyon(self):
                             attrs={'id': 'table'})[0]
         assert gc in df.to_string()
 
-    def test_different_number_of_rows(self):
+    def test_different_number_of_cols(self):
         expected = """<table border="1" class="dataframe">
                         <thead>
                             <tr style="text-align: right;">
@@ -654,6 +682,160 @@ def test_different_number_of_rows(self):
         res = self.read_html(out, index_col=0)[0]
         tm.assert_frame_equal(expected, res)
 
+    def test_colspan_rowspan_are_1(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <thead>
+                       <tr>
+                       <th colspan="1">X</th>
+                       <th>Y</th>
+                       <th rowspan="1">Z</th>
+                       <th>W</th>
+                       </tr>
+                   </thead>
+                   <tbody>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_colspan_rowspan_are_more_than_1(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                            <tr>
+                            <th>1</th>
+                            <th>2</th>
+                            <th>2</th>
+                            <th>Z</th>
+                            <th>3</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <thead>
+                       <tr>
+                       <th colspan="2">X</th>
+                       <th>Y</th>
+                       <th rowspan="2">Z</th>
+                       <th>W</th>
+                       </tr>
+                       <tr>
+                       <th>1</th>
+                       <th colspan="2">2</th>
+                       <th>3</th>
+                       </tr>
+                   </thead>
+                   <tbody>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_tbody_colspan_rowspan_copy_values(self):
+        # GH17054
+        expected = """<table>
+                        <tbody>
+                            <tr>
+                            <td>1</td>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            </tr>
+                            <tr>
+                            <td>5</td>
+                            <td>6</td>
+                            <td>6</td>
+                            <td>3</td>
+                            <td>7</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <tbody>
+                       <tr>
+                       <td colspan="2">1</td>
+                       <td>2</td>
+                       <td rowspan="2">3</td>
+                       <td>4</td>
+                       </tr>
+                       <tr>
+                       <td>5</td>
+                       <td colspan="2">6</td>
+                       <td>7</td>
+                       </tr>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_header_should_be_inferred_from_th_elements(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            <td>5</td>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                            <tr>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            <td>5</td>
+                    </table>"""
+        expected = self.read_html(expected)[0]  # header is explicit
+        res = self.read_html(out)[0]            # infer header
+        tm.assert_frame_equal(expected, res)
+        res2 = self.read_html(out, header=0)[0]  # manually set header
+        tm.assert_frame_equal(expected, res2)
+
     def test_parse_dates_list(self):
         df = DataFrame({'date': date_range('1/1/2001', periods=10)})
         expected = df.to_html()

+ - Move rows from bottom of body to footer only if + all elements inside row are	- def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) - res = [] - if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res + Parameters + ---------- + table_html : a single HTML table element. - def _parse_raw_tbody(self, table): - tbodies = self._parse_tbody(table) + Returns + ------- + tuple of (header, body, footer) + header : list of rows, each of which is a list of parsed + header elements + body : list of rows, each of which is a list of parsed body elements + footer : list of rows, each of which is a list of parsed + footer elements + """ - raw_data = [] + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) - if tbodies: - for tbody in tbodies: - raw_data.extend(self._parse_tr(tbody)) - else: - raw_data.extend(self._parse_tr(table)) + if not header_rows: + # The table has no
rows as headers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[0])): + # this row should be a header row, move it from body to header + header_rows.append(body_rows.pop(0)) + + if not footer_rows: + # The table has no
rows as footers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[-1])): + # this row should be a footer row, move it from body to footer + footer_rows.insert(0, body_rows.pop()) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + + return header, body, footer + + def _expand_colspan_rowspan(self, rows): + """Given a list of
foo	bar
`` elements into the header). + + .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied after `skiprows` is applied. diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c6a8de7ed446..b8f520ee17d72 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -18,7 +18,6 @@ from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -129,16 +128,7 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam_no_types(self): - - # infer_types removed in #10892 - df1 = self.read_html(self.spam_data, '.Water.') - df2 = self.read_html(self.spam_data, 'Unit') - assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' - - def test_spam_with_types(self): + def test_spam(self): df1 = self.read_html(self.spam_data, '.Water.') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) @@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -461,6 +451,44 @@ def test_header_and_one_column(self): result = self.read_html(data)[0] tm.assert_frame_equal(result, expected) + def test_thead_without_tr(self): + """ + Ensure parser adds
`` rows and ``	`` elements within each ``
`` - element in the table. ``	`` stands for "table data". + element in the table. ``	`` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``