     import cPickle as pickle
 except ImportError:
     import pickle
+from functools import wraps
 import logging
 from operator import itemgetter
 import os
@@ -45,146 +46,153 @@ def resource_stream(cls, package, resource_name):
 import socket
 import urllib2
 import urlparse
+import warnings
 
 LOG = logging.getLogger(__file__)
 
 SCHEME_RE = re.compile(r'^([' + urlparse.scheme_chars + ']+:)?//')
 IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
 
 class ExtractResult(tuple):
-    'ExtractResult(subdomain, domain, tld)'
-    __slots__ = ()
-    _fields = ('subdomain', 'domain', 'tld')
+    'ExtractResult(subdomain, domain, tld)'
+    __slots__ = ()
+    _fields = ('subdomain', 'domain', 'tld')
 
     def __new__(_cls, subdomain, domain, tld):
         'Create new instance of ExtractResult(subdomain, domain, tld)'
-        return tuple.__new__(_cls, (subdomain, domain, tld))
+        return tuple.__new__(_cls, (subdomain, domain, tld))
 
     @classmethod
     def _make(cls, iterable, new=tuple.__new__, len=len):
         'Make a new ExtractResult object from a sequence or iterable'
         result = new(cls, iterable)
         if len(result) != 3:
             raise TypeError('Expected 3 arguments, got %d' % len(result))
-        return result
+        return result
 
     def __repr__(self):
         'Return a nicely formatted representation string'
-        return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self
+        return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self
 
     def _asdict(self):
         'Return a new dict which maps field names to their values'
-        return dict(zip(self._fields, self))
+        return dict(zip(self._fields, self))
 
     def _replace(_self, **kwds):
         'Return a new ExtractResult object replacing specified fields with new values'
         result = _self._make(map(kwds.pop, ('subdomain', 'domain', 'tld'), _self))
         if kwds:
             raise ValueError('Got unexpected field names: %r' % kwds.keys())
-        return result
+        return result
 
     def __getnewargs__(self):
         'Return self as a plain tuple. Used by copy and pickle.'
-        return tuple(self)
+        return tuple(self)
 
     subdomain = property(itemgetter(0), doc='Alias for field number 0')
     domain = property(itemgetter(1), doc='Alias for field number 1')
     tld = property(itemgetter(2), doc='Alias for field number 2')
 
-def extract(url, fetch=True, cache_file=''):
-    """
-    Takes a string URL and splits it into its subdomain, domain, and
-    gTLD/ccTLD component. Ignores scheme, username, and path components.
+class TLDExtract(object):
+    def __init__(self, fetch=True, cache_file=''):
+        """
+        Constructs a callable for extracting subdomain, domain, and TLD
+        components from a URL.
 
-    If fetch is True (the default) and no cached TLD set is found, this module
-    will fetch TLD sources live over HTTP on first use. Set to False to
-    not make HTTP requests. Either way, if the TLD set can't be read, the
-    module will fall back to the included TLD set snapshot.
+        If fetch is True (the default) and no cached TLD set is found, this
+        extractor will fetch TLD sources live over HTTP on first use. Set to
+        False to not make HTTP requests. Either way, if the TLD set can't be
+        read, the module will fall back to the included TLD set snapshot.
 
-    Specifying cache_file will override the location of the TLD set. Defaults
-    to /path/to/tldextract/.tld_set.
+        Specifying cache_file will override the location of the TLD set.
+        Defaults to /path/to/tldextract/.tld_set.
 
-    >>> extract('http://forums.news.cnn.com/')
-    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
-    >>> extract('http://forums.bbc.co.uk/')
-    ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
-    """
-    netloc = SCHEME_RE.sub("", url).partition("/")[0]
-    return _extract(netloc, fetch, cache_file)
+        """
+        self.fetch = fetch
+        self.cache_file = cache_file
+        self._extractor = None
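+        # Note: the constructor only stores configuration; the TLD set
+        # itself is loaded lazily, on the first extraction.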
 
-def urlsplit(url, fetch=True, cache_file=''):
-    """Same as `extract` but calls urlparse.urlsplit to further 'validate' the
-    input URL. This function will therefore raise the same errors as
-    urlparse.urlsplit and handle some inputs differently than extract, such as
-    URLs missing a scheme.
+    def __call__(self, url):
+        """
+        Takes a string URL and splits it into its subdomain, domain, and
+        gTLD/ccTLD component.
+
+        >>> extract = TLDExtract()
+        >>> extract('http://forums.news.cnn.com/')
+        ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
+        >>> extract('http://forums.bbc.co.uk/')
+        ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
+        """
+        netloc = SCHEME_RE.sub("", url).partition("/")[0]
+        return self._extract(netloc)
+
+    def _extract(self, netloc):
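+        # Strip any username/password (before '@') and port (after ':').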
+        netloc = netloc.split("@")[-1].partition(':')[0]
+        registered_domain, tld = self._get_tld_extractor().extract(netloc)
+        if not tld and netloc and netloc[0].isdigit():
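+            # socket.inet_aton may be unavailable on some platforms; the
+            # AttributeError branch falls back to the IP_RE regex check.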
+            try:
+                is_ip = socket.inet_aton(netloc)
+                return ExtractResult('', netloc, '')
+            except AttributeError:
+                if IP_RE.match(netloc):
+                    return ExtractResult('', netloc, '')
+            except socket.error:
+                pass
 
-    >>> urlsplit('http://forums.news.cnn.com/')
-    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
-    >>> urlsplit('forums.bbc.co.uk/') # urlsplit won't see a netloc
-    ExtractResult(subdomain='', domain='', tld='')
-    """
-    netloc = urlparse.urlsplit(url).netloc
-    return _extract(netloc, fetch, cache_file)
-
-def _extract(netloc, fetch=True, cache_file=''):
-    netloc = netloc.split("@")[-1].partition(':')[0]
-    registered_domain, tld = _get_tld_extractor(fetch, cache_file).extract(netloc)
-    if not tld and netloc and netloc[0].isdigit():
+        subdomain, _, domain = registered_domain.rpartition('.')
+        return ExtractResult(subdomain, domain, tld)
+
+    def _get_tld_extractor(self):
+        if self._extractor:
+            return self._extractor
+
+        moddir = os.path.dirname(__file__)
+        cached_file = self.cache_file or os.path.join(moddir, '.tld_set')
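+        # Lookup order: on-disk cache, then (optionally) a live fetch, then
+        # the bundled .tld_set_snapshot.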
         try:
-            is_ip = socket.inet_aton(netloc)
-            return ExtractResult('', netloc, '')
-        except AttributeError:
-            if IP_RE.match(netloc):
-                return ExtractResult('', netloc, '')
-        except socket.error:
+            with open(cached_file) as f:
+                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(f))
+                return self._extractor
+        except IOError, file_not_found:
             pass
 
-    subdomain, _, domain = registered_domain.rpartition('.')
-    return ExtractResult(subdomain, domain, tld)
+        tlds = frozenset()
+        if self.fetch:
+            tld_sources = (_PublicSuffixListSource,)
+            tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())
+
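+        # tlds is empty when fetching is disabled or returned nothing, so
+        # fall back to the bundled snapshot in that case.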
+        if not tlds:
+            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
+                self._extractor = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
+                return self._extractor
+
+        LOG.info("computed TLDs: %s", tlds)
+        if LOG.isEnabledFor(logging.DEBUG):
+            import difflib
+            with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
+                snapshot = sorted(pickle.load(snapshot_file))
+            new = sorted(tlds)
+            for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=cached_file):
+                print >> sys.stderr, line
+
+        try:
+            with open(cached_file, 'w') as f:
+                pickle.dump(tlds, f)
+        except IOError, e:
+            LOG.warn("unable to cache TLDs in file %s: %s", cached_file, e)
 
-TLD_EXTRACTOR = None
+        self._extractor = _PublicSuffixListTLDExtractor(tlds)
+        return self._extractor
 
-def _get_tld_extractor(fetch=True, cache_file=''):
-    global TLD_EXTRACTOR
-    if TLD_EXTRACTOR:
-        return TLD_EXTRACTOR
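+# Module-level instance backing the legacy function API below.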
+TLD_EXTRACTOR = TLDExtract()
 
-    moddir = os.path.dirname(__file__)
-    cached_file = cache_file or os.path.join(moddir, '.tld_set')
-    try:
-        with open(cached_file) as f:
-            TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(pickle.load(f))
-            return TLD_EXTRACTOR
-    except IOError, file_not_found:
-        pass
-
-    tlds = frozenset()
-    if fetch:
-        tld_sources = (_PublicSuffixListSource,)
-        tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())
-
-    if not tlds:
-        with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
-            TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
-            return TLD_EXTRACTOR
-
-    LOG.info("computed TLDs: %s", tlds)
-    if LOG.isEnabledFor(logging.DEBUG):
-        import difflib
-        with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
-            snapshot = sorted(pickle.load(snapshot_file))
-        new = sorted(tlds)
-        for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=cached_file):
-            print >> sys.stderr, line
-
-    try:
-        with open(cached_file, 'w') as f:
-            pickle.dump(tlds, f)
-    except IOError, e:
-        LOG.warn("unable to cache TLDs in file %s: %s", cached_file, e)
-
-    TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(tlds)
-    return TLD_EXTRACTOR
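+# functools.wraps copies __call__'s docstring (including its doctests) onto
+# the legacy module-level functions.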
+@wraps(TLD_EXTRACTOR.__call__)
+def extract(url):
+    return TLD_EXTRACTOR(url)
+
+@wraps(TLD_EXTRACTOR.__call__)
+def urlsplit(url):
+    warnings.warn("Global tldextract.urlsplit function will be removed in 1.0. Call urlparse.urlsplit before calling tldextract.", DeprecationWarning)
+    return TLD_EXTRACTOR(urlparse.urlsplit(url).netloc)
 
 def _fetch_page(url):
     try: