@@ -142,6 +142,7 @@ def gen_tasks(self):
142
142
143
143
def scan_locs ():
144
144
"""Scan site locations."""
145
+ robots_rules = parse_robots_exclusions (kw ['robots_exclusions' ])
145
146
for root , dirs , files in os .walk (output , followlinks = True ):
146
147
if not dirs and not files and not kw ['sitemap_include_fileless_dirs' ]:
147
148
continue # Totally empty, not on sitemap
@@ -174,8 +175,12 @@ def scan_locs():
174
175
if path .endswith (kw ['index_file' ]) and kw ['strip_indexes' ]:
175
176
# ignore index files when stripping urls
176
177
continue
177
- if not robot_fetch (path ):
178
- continue
178
+ if robots_rules :
179
+ abspath = '/' + path
180
+ if sys .version_info [0 ] == 2 :
181
+ abspath = abspath .encode ('utf-8' )
182
+ if not robots_rules .can_fetch ('*' , abspath ):
183
+ continue
179
184
180
185
# read in binary mode to make ancient files work
181
186
fh = open (real_path , 'rb' )
@@ -223,18 +228,19 @@ def scan_locs():
223
228
alternates .append (alternates_format .format (lang , alt_url ))
224
229
urlset [loc ] = loc_format .format (encodelink (loc ), lastmod , '\n ' .join (alternates ))
225
230
226
def parse_robots_exclusions(exclusions):
    """Build a robots.txt parser from a list of exclusion rules.

    Parameters
    ----------
    exclusions : iterable of str
        Path prefixes to disallow for all user agents (each becomes a
        ``Disallow:`` line applied under ``User-Agent: *``).

    Returns
    -------
    robotparser.RobotFileParser or None
        A parser whose ``can_fetch('*', path)`` reflects the rules, or
        ``None`` when there are no exclusions — callers can then skip
        the robots check entirely instead of re-parsing per path.
    """
    rules = ['Disallow: {0}'.format(rule) for rule in exclusions]
    if not rules:
        # No exclusions configured: signal "no filtering needed".
        return None
    robot = robotparser.RobotFileParser()
    lines = ['User-Agent: *'] + rules
    if sys.version_info[0] == 2:
        # Python 2's robotparser module expects byte strings.
        lines = [line.encode('utf-8') for line in lines]
    robot.parse(lines)
    return robot
238
244
239
245
def write_sitemap ():
240
246
"""Write sitemap to file."""
0 commit comments