diff --git a/pandas/io/html.py b/pandas/io/html.py index 8a73c786825e3..10701be4f7e0b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -23,6 +23,7 @@ AbstractMethodError, EmptyDataError, ) +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend @@ -32,6 +33,7 @@ from pandas.core.indexes.base import Index from pandas.core.indexes.multi import MultiIndex from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( file_exists, @@ -363,13 +365,13 @@ def _parse_tfoot_tr(self, table): """ raise AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): """ Return all tables from the parsed DOM. Parameters ---------- - doc : the DOM from which to parse the table element. + document : the DOM from which to parse the table element. match : str or regular expression The text to search for in the DOM tree. @@ -594,9 +596,9 @@ def __init__(self, *args, **kwargs) -> None: self._strainer = SoupStrainer("table") - def _parse_tables(self, doc, match, attrs): + def _parse_tables(self, document, match, attrs): element_name = self._strainer.name - tables = doc.find_all(element_name, attrs=attrs) + tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -726,7 +728,7 @@ def _parse_td(self, row): # or (see _parse_thead_tr). return row.xpath("./td|./th") - def _parse_tables(self, doc, match, kwargs): + def _parse_tables(self, document, match, kwargs): pattern = match.pattern # 1. 
check all descendants for the given pattern and only search tables @@ -738,7 +740,7 @@ def _parse_tables(self, doc, match, kwargs): if kwargs: xpath_expr += _build_xpath_expr(kwargs) - tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = document.xpath(xpath_expr, namespaces=_re_namespace) tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: @@ -1026,6 +1028,7 @@ def _parse( return ret +@doc(storage_options=_shared_docs["storage_options"]) def read_html( io: FilePath | ReadBuffer[str], *, @@ -1096,13 +1099,13 @@ def read_html( passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - attrs = {'id': 'table'} + attrs = {{'id': 'table'}} is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - attrs = {'asdf': 'table'} + attrs = {{'asdf': 'table'}} is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 @@ -1144,13 +1147,13 @@ def read_html( displayed_only : bool, default True Whether elements with "display: none" should be parsed. - extract_links : {None, "all", "header", "body", "footer"} + extract_links : {{None, "all", "header", "body", "footer"}} Table elements in the specified section(s) with <a> tags will have their href extracted. .. versionadded:: 1.5.0 - dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -1161,6 +1164,10 @@ def read_html( .. versionadded:: 2.0 + {storage_options} + + .. versionadded:: 2.1.0 + Returns ------- dfs