Skip to content

Commit 7087f0a

Browse files
Global functions extract and urlsplit now use a singleton TLDExtract class instance. The global functions now take only the url param; instantiate your own TLDExtract to set the other 2 params.
* Deprecate tldextract.urlsplit. Clients can easily call urlparse.urlsplit before calling tldextract to achieve the same functionality. * Clean up whitespace.
1 parent 5b16c81 commit 7087f0a

File tree

1 file changed

+100
-92
lines changed

1 file changed

+100
-92
lines changed

tldextract/tldextract.py

Lines changed: 100 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import cPickle as pickle
2525
except ImportError:
2626
import pickle
27+
from functools import wraps
2728
import logging
2829
from operator import itemgetter
2930
import os
@@ -45,146 +46,153 @@ def resource_stream(cls, package, resource_name):
4546
import socket
4647
import urllib2
4748
import urlparse
49+
import warnings
4850

4951
LOG = logging.getLogger(__file__)
5052

5153
SCHEME_RE = re.compile(r'^([' + urlparse.scheme_chars + ']+:)?//')
5254
IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
5355

5456
class ExtractResult(tuple):
    'ExtractResult(subdomain, domain, tld)'

    # Immutable, field-less instances: all state lives in the tuple itself.
    __slots__ = ()

    _fields = ('subdomain', 'domain', 'tld')

    def __new__(_cls, subdomain, domain, tld):
        'Create new instance of ExtractResult(subdomain, domain, tld)'
        return tuple.__new__(_cls, (subdomain, domain, tld))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new ExtractResult object from a sequence or iterable'
        made = new(cls, iterable)
        if len(made) != 3:
            raise TypeError('Expected 3 arguments, got %d' % len(made))
        return made

    def __repr__(self):
        'Return a nicely formatted representation string'
        return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self

    def _asdict(self):
        'Return a new dict which maps field names to their values'
        return dict(zip(self._fields, self))

    def _replace(_self, **kwds):
        'Return a new ExtractResult object replacing specified fields with new values'
        # Pop each field's override (falling back to the current value); any
        # names left over in kwds were not valid fields.
        replaced = _self._make(map(kwds.pop, ('subdomain', 'domain', 'tld'), _self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % kwds.keys())
        return replaced

    def __getnewargs__(self):
        'Return self as a plain tuple. Used by copy and pickle.'
        return tuple(self)

    # Positional aliases so fields read like attributes.
    subdomain = property(itemgetter(0), doc='Alias for field number 0')
    domain = property(itemgetter(1), doc='Alias for field number 1')
    tld = property(itemgetter(2), doc='Alias for field number 2')
9395

94-
class TLDExtract(object):
    """
    Callable that splits a URL into its subdomain, domain, and gTLD/ccTLD
    components, backed by a lazily-loaded, cached set of known TLDs.
    """

    def __init__(self, fetch=True, cache_file=''):
        """
        Constructs a callable for extracting subdomain, domain, and TLD
        components from a URL.

        If fetch is True (the default) and no cached TLD set is found, this
        extractor will fetch TLD sources live over HTTP on first use. Set to
        False to not make HTTP requests. Either way, if the TLD set can't be
        read, the module will fall back to the included TLD set snapshot.

        Specifying cache_file will override the location of the TLD set.
        Defaults to /path/to/tldextract/.tld_set.
        """
        self.fetch = fetch
        self.cache_file = cache_file
        # Built lazily by _get_tld_extractor on first use.
        self._extractor = None

    def __call__(self, url):
        """
        Takes a string URL and splits it into its subdomain, domain, and
        gTLD/ccTLD component. Ignores scheme, username, and path components.

        >>> extract = TLDExtract()
        >>> extract('http://forums.news.cnn.com/')
        ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
        >>> extract('http://forums.bbc.co.uk/')
        ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
        """
        # Strip any scheme, then keep everything up to the first slash.
        netloc = SCHEME_RE.sub("", url).partition("/")[0]
        return self._extract(netloc)

    def _extract(self, netloc):
        """Split a bare netloc into an ExtractResult, handling IPv4 hosts."""
        # Drop userinfo ("user:pass@") and any trailing ":port".
        netloc = netloc.split("@")[-1].partition(':')[0]
        registered_domain, tld = self._get_tld_extractor().extract(netloc)
        if not tld and netloc and netloc[0].isdigit():
            # No known TLD and the host starts with a digit: it may be a raw
            # IPv4 address, which belongs in the domain slot alone.
            try:
                socket.inet_aton(netloc)
                return ExtractResult('', netloc, '')
            except AttributeError:
                # Platforms without inet_aton: fall back to a regex check.
                if IP_RE.match(netloc):
                    return ExtractResult('', netloc, '')
            except socket.error:
                pass

        subdomain, _, domain = registered_domain.rpartition('.')
        return ExtractResult(subdomain, domain, tld)

    def _get_tld_extractor(self):
        """
        Return (building and caching on first call) the underlying
        _PublicSuffixListTLDExtractor. Resolution order: in-memory instance,
        on-disk cache file, live fetch (if self.fetch), bundled snapshot.
        """
        if self._extractor:
            return self._extractor

        moddir = os.path.dirname(__file__)
        cached_file = self.cache_file or os.path.join(moddir, '.tld_set')
        try:
            with open(cached_file) as f:
                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(f))
                return self._extractor
        except IOError:
            # No cache file yet; fall through and build the TLD set.
            pass

        tlds = frozenset()
        if self.fetch:
            tld_sources = (_PublicSuffixListSource,)
            tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())

        if not tlds:
            # Fetching disabled or failed: fall back to the bundled snapshot.
            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
                return self._extractor

        LOG.info("computed TLDs: %s", tlds)
        if LOG.isEnabledFor(logging.DEBUG):
            # At debug level, show how the freshly fetched set differs from
            # the bundled snapshot.
            import difflib
            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
                snapshot = sorted(pickle.load(snapshot_file))
            new = sorted(tlds)
            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=cached_file):
                sys.stderr.write(str(line) + '\n')

        try:
            with open(cached_file, 'w') as f:
                pickle.dump(tlds, f)
        except IOError as e:
            # Best effort only: an unwritable cache just means a refetch later.
            LOG.warning("unable to cache TLDs in file %s: %s", cached_file, e)

        self._extractor = _PublicSuffixListTLDExtractor(tlds)
        return self._extractor
# Module-level singleton backing the global convenience functions below.
TLD_EXTRACTOR = TLDExtract()


@wraps(TLD_EXTRACTOR.__call__)
def extract(url):
    # Delegate to the shared extractor; @wraps copies its docstring here.
    return TLD_EXTRACTOR(url)


@wraps(TLD_EXTRACTOR.__call__)
def urlsplit(url):
    # Deprecated: equivalent to urlparse.urlsplit followed by extract.
    warnings.warn("Global tldextract.urlsplit function will be removed in 1.0. Call urlparse.urlsplit before calling tldextract.", DeprecationWarning)
    return TLD_EXTRACTOR(urlparse.urlsplit(url).netloc)
188196

189197
def _fetch_page(url):
190198
try:

0 commit comments

Comments
 (0)