Use html5lib to parse HTML instead of using regexes
HTML is not a regular language and thus cannot be accurately parsed
with a regex. The regexes that pip had were gnarly, and required
multiple levels of them in order to "work".

Using html5lib instead of regexes makes the parsing step simpler,
better tested, and much more robust.

There is still some use of regexes to parse HTML: namely, the hack
that looks for a <th> tag with a "rel" after it. That hack has to
match content inside of HTML comments, which a parser exposes only as
opaque text, so it cannot be done by parsing the page as HTML.
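
To make the change concrete, here is a minimal sketch (not part of the
commit; the HTML fragment is invented) contrasting the _href_re regex
removed below with html5lib's element tree on a commented-out link:

    import re
    import html5lib  # vendored by this commit as pip.vendor.html5lib

    content = """
    <html><body>
    <!-- <a href="/pkg-0.9.tar.gz">stale</a> -->
    <a href="/pkg-1.0.tar.gz">pkg</a>
    </body></html>
    """

    # The regex happily "finds" the href hidden inside the HTML comment:
    _href_re = re.compile('href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))', re.I | re.S)
    print([m.group(1) for m in _href_re.finditer(content)])
    # -> ['/pkg-0.9.tar.gz', '/pkg-1.0.tar.gz']

    # html5lib builds a real tree; comment contents never become elements,
    # so only the live anchor is reported:
    parsed = html5lib.parse(content, namespaceHTMLElements=False)
    print([a.get("href") for a in parsed.findall(".//a")])
    # -> ['/pkg-1.0.tar.gz']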
dstufft committed Jun 2, 2013
1 parent cba4521 commit 2eb43b2
Showing 1 changed file with 22 additions and 23 deletions.
pip/index.py
@@ -31,6 +31,7 @@
 from pip.download import urlopen, path_to_url2, url_to_path, geturl, Urllib2HeadRequest
 from pip.wheel import Wheel, wheel_ext, wheel_distribute_support, distribute_requirement
 from pip.pep425tags import supported_tags
+from pip.vendor import html5lib
 
 __all__ = ['PackageFinder']

@@ -475,13 +476,11 @@ class HTMLPage(object):
     ## FIXME: these regexes are horrible hacks:
     _homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
     _download_re = re.compile(r'<th>\s*download\s+url', re.I)
-    ## These aren't so aweful:
-    _rel_re = re.compile("""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
-    _href_re = re.compile('href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))', re.I|re.S)
-    _base_re = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I)
 
     def __init__(self, content, url, headers=None):
         self.content = content
+        self.parsed = html5lib.parse(self.content, namespaceHTMLElements=False)
         self.url = url
         self.headers = headers

@@ -602,20 +601,21 @@ def _get_content_type(url):
     @property
     def base_url(self):
         if not hasattr(self, "_base_url"):
-            match = self._base_re.search(self.content)
-            if match:
-                self._base_url = match.group(1)
+            base = self.parsed.find(".//base")
+            if base is not None and base.get("href"):
+                self._base_url = base.get("href")
             else:
                 self._base_url = self.url
         return self._base_url
 
     @property
     def links(self):
         """Yields all links in the page"""
-        for match in self._href_re.finditer(self.content):
-            url = match.group(1) or match.group(2) or match.group(3)
-            url = self.clean_link(urlparse.urljoin(self.base_url, url))
-            yield Link(url, self)
+        for anchor in self.parsed.findall(".//a"):
+            if anchor.get("href"):
+                href = anchor.get("href")
+                url = self.clean_link(urlparse.urljoin(self.base_url, href))
+                yield Link(url, self)
 
     def rel_links(self):
         for url in self.explicit_rel_links():
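
As a usage sketch of the new base_url logic (all values invented), a
page that declares a <base> tag has relative links resolved against it
rather than against the page's own URL:

    import urlparse  # Python 2 stdlib, as used by pip/index.py here
    import html5lib

    doc = html5lib.parse(
        '<base href="https://mirror.example/simple/">'
        '<a href="pkg-1.0.tar.gz">pkg</a>',
        namespaceHTMLElements=False)
    base = doc.find(".//base")
    if base is not None and base.get("href"):
        base_url = base.get("href")
    else:
        base_url = "https://pypi.example/simple/pkg/"  # the page's own URL
    print(urlparse.urljoin(base_url, "pkg-1.0.tar.gz"))
    # -> https://mirror.example/simple/pkg-1.0.tar.gz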
@@ -625,21 +625,20 @@ def rel_links(self):
 
     def explicit_rel_links(self, rels=('homepage', 'download')):
         """Yields all links with the given relations"""
-        for match in self._rel_re.finditer(self.content):
-            found_rels = match.group(1).lower().split()
-            for rel in rels:
-                if rel in found_rels:
-                    break
-            else:
-                continue
-            match = self._href_re.search(match.group(0))
-            if not match:
-                continue
-            url = match.group(1) or match.group(2) or match.group(3)
-            url = self.clean_link(urlparse.urljoin(self.base_url, url))
-            yield Link(url, self)
+        rels = set(rels)
+
jezdez (Member), Jun 2, 2013:
Hm, aren't sets unordered?

dstufft (Author), Jun 2, 2013:
Yes, but that doesn't matter here. We are only using this set to test
for membership (is the rel of this link one of the rels I'm looking
for?). Using sets allows us to use an intersection instead of needing
to loop over all of the possible rels.

jezdez, Jun 2, 2013:
Ah, excellent.
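
A tiny sketch (not from the commit) of the membership test being
discussed; the set's ordering never matters, only whether the
intersection is non-empty:

    rels = set(('homepage', 'download'))           # what we are looking for
    found_rels = set('download nofollow'.split())  # from rel="download nofollow"
    # A non-empty intersection means this anchor carries at least one
    # interesting rel; no loop over the candidate rels is needed.
    if found_rels & rels:
        print('matched')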


+        for anchor in self.parsed.findall(".//a"):
+            if anchor.get("rel") and anchor.get("href"):
+                found_rels = set(anchor.get("rel").split())
+                # Determine the intersection between what rels were found and
+                # what rels were being looked for
+                if found_rels & rels:
+                    href = anchor.get("href")
+                    url = self.clean_link(urlparse.urljoin(self.base_url, href))
+                    yield Link(url, self)

     def scraped_rel_links(self):
         # Can we get rid of this horrible horrible method?
         for regex in (self._homepage_re, self._download_re):
             match = regex.search(self.content)
             if not match:
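The scraped_rel_links method above is the hack the commit message
mentions: the <th> rows it scrapes can be buried inside HTML comments,
which a real parser exposes only as opaque comment nodes. A small
sketch (the fragment is invented) of why the regex has to stay:

    import re

    # Row hidden inside a comment; html5lib would yield no <th> element here.
    content = """
    <!--
    <tr><th> Home Page: </th><td><a href="http://example.com/">home</a></td></tr>
    -->
    """

    _homepage_re = re.compile(r'<th>\s*home\s*page', re.I)
    print(bool(_homepage_re.search(content)))  # -> True; the regex sees into comments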
