
Commit 8833af2

Refactor checking robots_exclusions
1 parent: 140ebdf

1 file changed: +19 -13 lines

nikola/plugins/task/sitemap/__init__.py

@@ -142,6 +142,7 @@ def gen_tasks(self):

         def scan_locs():
             """Scan site locations."""
+            robots_rules = parse_robots_exclusions(kw['robots_exclusions'])
             for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
@@ -174,8 +175,12 @@ def scan_locs():
                         if path.endswith(kw['index_file']) and kw['strip_indexes']:
                             # ignore index files when stripping urls
                             continue
-                        if not robot_fetch(path):
-                            continue
+                        if robots_rules:
+                            abspath = '/' + path
+                            if sys.version_info[0] == 2:
+                                abspath = abspath.encode('utf-8')
+                            if not robots_rules.can_fetch('*', abspath):
+                                continue

                         # read in binary mode to make ancient files work
                         fh = open(real_path, 'rb')
@@ -223,18 +228,19 @@ def scan_locs():
                                 alternates.append(alternates_format.format(lang, alt_url))
                         urlset[loc] = loc_format.format(encodelink(loc), lastmod, '\n'.join(alternates))

-        def robot_fetch(path):
-            """Check if robots can fetch a file."""
-            for rule in kw["robots_exclusions"]:
+        def parse_robots_exclusions(exclusions):
+            """Parse rules to check fetchable."""
+            rules = []
+            for rule in exclusions:
+                rules.append('Disallow: {0}'.format(rule))
+            if len(rules):
                 robot = robotparser.RobotFileParser()
-                robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
-                if sys.version_info[0] == 3:
-                    if not robot.can_fetch("*", '/' + path):
-                        return False  # not robot food
-                else:
-                    if not robot.can_fetch("*", ('/' + path).encode('utf-8')):
-                        return False  # not robot food
-            return True
+                rules = ['User-Agent: *'] + rules
+                if sys.version_info[0] == 2:
+                    rules = [ line.encode('utf-8') for line in rules ]
+                robot.parse(rules)
+                return robot
+            return None

         def write_sitemap():
             """Write sitemap to file."""
