Skip to content

Commit

Permalink
Much faster implementation of FileList, for big egg_info speedups
Browse files Browse the repository at this point in the history
  • Loading branch information
mx-moth committed Oct 14, 2016
1 parent 7edbffc commit d31f12e
Show file tree
Hide file tree
Showing 3 changed files with 468 additions and 62 deletions.
297 changes: 249 additions & 48 deletions setuptools/command/egg_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,123 @@
Create a distribution's .egg-info directory and contents"""

from distutils.filelist import FileList as _FileList
from distutils.util import convert_path
from distutils import log
import collections
import distutils.errors
import distutils.filelist
import io
import os
import re
import sys
import io
import warnings
import time
import collections

from setuptools.extern import six
from setuptools.extern.six.moves import map
import warnings
from distutils import log
from distutils.filelist import FileList as _FileList
from distutils.filelist import translate_pattern
from distutils.util import convert_path
from fnmatch import translate

from setuptools import Command
from setuptools.command.sdist import sdist
from setuptools.command.sdist import walk_revctrl
from setuptools.command.setopt import edit_config
from setuptools.command import bdist_egg
from pkg_resources import (
parse_requirements, safe_name, parse_version,
safe_version, yield_lines, EntryPoint, iter_entry_points, to_filename)
import setuptools.unicode_utils as unicode_utils

from pkg_resources import (
EntryPoint, iter_entry_points, parse_requirements, parse_version, safe_name,
safe_version, to_filename, yield_lines)
from pkg_resources.extern import packaging
from setuptools import Command
from setuptools.command import bdist_egg
from setuptools.command.sdist import sdist, walk_revctrl
from setuptools.command.setopt import edit_config
from setuptools.extern import six
from setuptools.extern.six.moves import map
from setuptools.glob import glob

try:
from setuptools_svn import svn_utils
except ImportError:
pass



def translate_pattern(glob):
"""
Translate a file path glob like '*.txt' in to a regular expression.
This differs from fnmatch.translate which allows wildcards to match
directory separators. It also knows about '**/' which matches any number of
directories.
"""
pat = ''

# This will split on '/' within [character classes]. This is deliberate.
chunks = glob.split(os.path.sep)

sep = re.escape(os.sep)
valid_char = '[^%s]' % (sep,)

for c, chunk in enumerate(chunks):
last_chunk = c == len(chunks) - 1

# Chunks that are a literal ** are globstars. They match anything.
if chunk == '**':
if last_chunk:
# Match anything if this is the last component
pat += '.*'
else:
# Match '(name/)*'
pat += '(?:%s+%s)*' % (valid_char, sep)
continue # Break here as the whole path component has been handled

# Find any special characters in the remainder
i = 0
chunk_len = len(chunk)
while i < chunk_len:
char = chunk[i]
if char == '*':
# Match any number of name characters
pat += valid_char + '*'
elif char == '?':
# Match a name character
pat += valid_char
elif char == '[':
# Character class
inner_i = i + 1
# Skip initial !/] chars
if inner_i < chunk_len and chunk[inner_i] == '!':
inner_i = inner_i + 1
if inner_i < chunk_len and chunk[inner_i] == ']':
inner_i = inner_i + 1

# Loop till the closing ] is found
while inner_i < chunk_len and chunk[inner_i] != ']':
inner_i = inner_i + 1

if inner_i >= chunk_len:
# Got to the end of the string without finding a closing ]
# Do not treat this as a matching group, but as a literal [
pat += re.escape(char)
else:
# Grab the insides of the [brackets]
inner = chunk[i + 1:inner_i]
char_class = ''

# Class negation
if inner[0] == '!':
char_class = '^'
inner = inner[1:]

char_class += re.escape(inner)
pat += '[%s]' % (char_class,)

# Skip to the end ]
i = inner_i
else:
pat += re.escape(char)
i += 1

# Join each chunk with the dir separator
if not last_chunk:
pat += sep

return re.compile(pat + r'\Z(?ms)')


class egg_info(Command):
description = "create a distribution's .egg-info directory"

Expand Down Expand Up @@ -239,7 +322,151 @@ def check_broken_egg_info(self):


class FileList(_FileList):
"""File list that accepts only existing, platform-independent paths"""
# Implementations of the various MANIFEST.in commands

def process_template_line(self, line):
# Parse the line: split it up, make sure the right number of words
# is there, and return the relevant words. 'action' is always
# defined: it's the first word of the line. Which of the other
# three are defined depends on the action; it'll be either
# patterns, (dir and patterns), or (dir_pattern).
(action, patterns, dir, dir_pattern) = self._parse_template_line(line)

# OK, now we know that the action is valid and we have the
# right number of words on the line for that action -- so we
# can proceed with minimal error-checking.
if action == 'include':
self.debug_print("include " + ' '.join(patterns))
for pattern in patterns:
if not self.include(pattern):
log.warn("warning: no files found matching '%s'", pattern)

elif action == 'exclude':
self.debug_print("exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.exclude(pattern):
log.warn(("warning: no previously-included files "
"found matching '%s'"), pattern)

elif action == 'global-include':
self.debug_print("global-include " + ' '.join(patterns))
for pattern in patterns:
if not self.global_include(pattern):
log.warn(("warning: no files found matching '%s' "
"anywhere in distribution"), pattern)

elif action == 'global-exclude':
self.debug_print("global-exclude " + ' '.join(patterns))
for pattern in patterns:
if not self.global_exclude(pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found anywhere in distribution"),
pattern)

elif action == 'recursive-include':
self.debug_print("recursive-include %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_include(dir, pattern):
log.warn(("warning: no files found matching '%s' "
"under directory '%s'"),
pattern, dir)

elif action == 'recursive-exclude':
self.debug_print("recursive-exclude %s %s" %
(dir, ' '.join(patterns)))
for pattern in patterns:
if not self.recursive_exclude(dir, pattern):
log.warn(("warning: no previously-included files matching "
"'%s' found under directory '%s'"),
pattern, dir)

elif action == 'graft':
self.debug_print("graft " + dir_pattern)
if not self.graft(dir_pattern):
log.warn("warning: no directories found matching '%s'",
dir_pattern)

elif action == 'prune':
self.debug_print("prune " + dir_pattern)
if not self.prune(dir_pattern):
log.warn(("no previously-included directories found "
"matching '%s'"), dir_pattern)

else:
raise DistutilsInternalError(
"this cannot happen: invalid action '%s'" % action)

def _remove_files(self, predicate):
"""
Remove all files from the file list that match the predicate.
Return True if any matching files were removed
"""
found = False
for i in range(len(self.files) - 1, -1, -1):
if predicate(self.files[i]):
self.debug_print(" removing " + self.files[i])
del self.files[i]
found = True
return found

def include(self, pattern):
"""Include files that match 'pattern'."""
found = [f for f in glob(pattern) if not os.path.isdir(f)]
self.extend(found)
return bool(found)

def exclude(self, pattern):
"""Exclude files that match 'pattern'."""
match = translate_pattern(pattern)
return self._remove_files(match.match)

def recursive_include(self, dir, pattern):
"""
Include all files anywhere in 'dir/' that match the pattern.
"""
full_pattern = os.path.join(dir, '**', pattern)
found = [f for f in glob(full_pattern, recursive=True)
if not os.path.isdir(f)]
self.extend(found)
return bool(found)

def recursive_exclude(self, dir, pattern):
"""
Exclude any file anywhere in 'dir/' that match the pattern.
"""
match = translate_pattern(os.path.join(dir, '**', pattern))
return self._remove_files(match.match)

def graft(self, dir):
"""Include all files from 'dir/'."""
found = distutils.filelist.findall(dir)
self.extend(found)
return bool(found)

def prune(self, dir):
"""Filter out files from 'dir/'."""
match = translate_pattern(os.path.join(dir, '**'))
return self._remove_files(match.match)

def global_include(self, pattern):
"""
Include all files anywhere in the current directory that match the
pattern. This is very inefficient on large file trees.
"""
if self.allfiles is None:
self.findall()
match = translate_pattern(os.path.join('**', pattern))
found = [f for f in self.allfiles if match.match(f)]
self.extend(found)
return bool(found)

def global_exclude(self, pattern):
"""
Exclude all files anywhere that match the pattern.
"""
match = translate_pattern(os.path.join('**', pattern))
return self._remove_files(match.match)

def append(self, item):
if item.endswith('\r'): # Fix older sdists built on Windows
Expand Down Expand Up @@ -302,7 +529,6 @@ def run(self):
self.filelist = FileList()
if not os.path.exists(self.manifest):
self.write_manifest() # it must exist so it'll get in the list
self.filelist.findall()
self.add_defaults()
if os.path.exists(self.template):
self.read_template()
Expand Down Expand Up @@ -341,38 +567,13 @@ def add_defaults(self):
elif os.path.exists(self.manifest):
self.read_manifest()
ei_cmd = self.get_finalized_command('egg_info')
self._add_egg_info(cmd=ei_cmd)
self.filelist.include_pattern("*", prefix=ei_cmd.egg_info)

def _add_egg_info(self, cmd):
"""
Add paths for egg-info files for an external egg-base.
The egg-info files are written to egg-base. If egg-base is
outside the current working directory, this method
searchs the egg-base directory for files to include
in the manifest. Uses distutils.filelist.findall (which is
really the version monkeypatched in by setuptools/__init__.py)
to perform the search.
Since findall records relative paths, prefix the returned
paths with cmd.egg_base, so add_default's include_pattern call
(which is looking for the absolute cmd.egg_info) will match
them.
"""
if cmd.egg_base == os.curdir:
# egg-info files were already added by something else
return

discovered = distutils.filelist.findall(cmd.egg_base)
resolved = (os.path.join(cmd.egg_base, path) for path in discovered)
self.filelist.allfiles.extend(resolved)
self.filelist.graft(ei_cmd.egg_info)

def prune_file_list(self):
build = self.get_finalized_command('build')
base_dir = self.distribution.get_fullname()
self.filelist.exclude_pattern(None, prefix=build.build_base)
self.filelist.exclude_pattern(None, prefix=base_dir)
self.filelist.prune(build.build_base)
self.filelist.prune(base_dir)
sep = re.escape(os.sep)
self.filelist.exclude_pattern(r'(^|' + sep + r')(RCS|CVS|\.svn)' + sep,
is_regex=1)
Expand Down
Loading

0 comments on commit d31f12e

Please sign in to comment.