
Improve performance of deletions in initial fine-grained cache load #4701

Merged 7 commits on Mar 9, 2018
77 changes: 54 additions & 23 deletions mypy/build.py
@@ -80,6 +80,7 @@ class BuildResult:
manager: The build manager.
files: Dictionary from module name to related AST node.
types: Dictionary from parse tree node to its inferred type.
used_cache: Whether the build took advantage of a cache
errors: List of error messages.
"""

@@ -88,6 +89,7 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
self.graph = graph
self.files = manager.modules
self.types = manager.all_types # Non-empty for tests only or if dumping deps
self.used_cache = manager.cache_enabled
self.errors = [] # type: List[str] # Filled in by build if desired


@@ -569,6 +571,9 @@ class BuildManager:
flush_errors: A function for processing errors after each SCC
saved_cache: Dict with saved cache state for coarse-grained dmypy
(read-write!)
cache_enabled: Whether cache usage is enabled. This is set based on options,
but is disabled if fine-grained cache loading fails
and after an initial fine-grained load.
stats: Dict with various instrumentation numbers
"""

@@ -588,7 +593,6 @@ def __init__(self, data_dir: str,
self.data_dir = data_dir
self.errors = errors
self.errors.set_ignore_prefix(ignore_prefix)
self.only_load_from_cache = options.use_fine_grained_cache
self.lib_path = tuple(lib_path)
self.source_set = source_set
self.reports = reports
@@ -607,9 +611,14 @@ def __init__(self, data_dir: str,
self.rechecked_modules = set() # type: Set[str]
self.plugin = plugin
self.flush_errors = flush_errors
self.cache_enabled = options.incremental and (
not options.fine_grained_incremental or options.use_fine_grained_cache)
self.saved_cache = saved_cache if saved_cache is not None else {} # type: SavedCache
self.stats = {} # type: Dict[str, Any] # Values are ints or floats

def use_fine_grained_cache(self) -> bool:
return self.cache_enabled and self.options.use_fine_grained_cache

def maybe_swap_for_shadow_path(self, path: str) -> str:
if (self.options.shadow_file and
os.path.samefile(self.options.shadow_file[0], path)):
@@ -1157,7 +1166,7 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
# changed since the cache was generated. We *don't* want to do a
# coarse-grained incremental rebuild, so we accept the cache
# metadata even if it doesn't match the source file.
if manager.options.use_fine_grained_cache:
if manager.use_fine_grained_cache():
manager.log('Using potentially stale metadata for {}'.format(id))
return meta

@@ -1655,7 +1664,7 @@ def __init__(self,
self.path = path
self.xpath = path or '<string>'
self.source = source
if path and source is None and self.options.incremental:
if path and source is None and self.manager.cache_enabled:
self.meta = find_cache_meta(self.id, path, manager)
# TODO: Get mtime if not cached.
if self.meta is not None:
@@ -1675,10 +1684,10 @@ def __init__(self,
for id, line in zip(self.meta.dependencies, self.meta.dep_lines)}
self.child_modules = set(self.meta.child_modules)
else:
# In fine-grained cache mode, pretend we only know about modules that
# have cache information and defer handling new modules until the
# fine-grained update.
if manager.only_load_from_cache:
# When doing a fine-grained cache load, pretend we only
# know about modules that have cache information and defer
# handling new modules until the fine-grained update.
if manager.use_fine_grained_cache():
manager.log("Deferring module to fine-grained update %s (%s)" % (path, id))
raise ModuleNotFound

@@ -1795,13 +1804,15 @@ def load_tree(self) -> None:

def fix_cross_refs(self) -> None:
assert self.tree is not None, "Internal error: method must be called on parsed file only"
# We need to set quick_and_dirty when doing a fine grained
# cache load because we need to gracefully handle missing modules.
fixup_module_pass_one(self.tree, self.manager.modules,
self.manager.options.quick_and_dirty)
self.manager.options.quick_and_dirty or
self.manager.use_fine_grained_cache())

def calculate_mros(self) -> None:
assert self.tree is not None, "Internal error: method must be called on parsed file only"
fixup_module_pass_two(self.tree, self.manager.modules,
self.manager.options.quick_and_dirty)
fixup_module_pass_two(self.tree, self.manager.modules)

def patch_dependency_parents(self) -> None:
"""
@@ -2058,7 +2069,7 @@ def valid_references(self) -> Set[str]:

def write_cache(self) -> None:
assert self.tree is not None, "Internal error: method must be called on parsed file only"
if not self.path or self.options.cache_dir == os.devnull:
if not self.path or not self.manager.cache_enabled:
return
if self.manager.options.quick_and_dirty:
is_errors = self.manager.errors.is_errors_for_file(self.path)
@@ -2105,9 +2116,12 @@ def dispatch(sources: List[BuildSource], manager: BuildManager) -> Graph:
# This is a kind of unfortunate hack to work around some of fine-grained's
# fragility: if we have loaded less than 50% of the specified files from
# cache in fine-grained cache mode, load the graph again honestly.
if manager.options.use_fine_grained_cache and len(graph) < 0.50 * len(sources):
manager.log("Redoing load_graph because too much was missing")
manager.only_load_from_cache = False
# In this case, we just turn the cache off entirely, so we don't need
# to worry about some files being loaded and some from cache and so
# that fine-grained mode never *writes* to the cache.
if manager.use_fine_grained_cache() and len(graph) < 0.50 * len(sources):
manager.log("Redoing load_graph without cache because too much was missing")
manager.cache_enabled = False
graph = load_graph(sources, manager)

t1 = time.time()
@@ -2128,7 +2142,13 @@ def dispatch(sources: List[BuildSource], manager: BuildManager) -> Graph:
if manager.options.dump_graph:
dump_graph(graph)
return graph
process_graph(graph, manager)
# If we are loading a fine-grained incremental mode cache, we
# don't want to do a real incremental reprocess of the graph---we
# just want to load in all of the cache information.
if manager.use_fine_grained_cache():
process_fine_grained_cache_graph(graph, manager)
else:
process_graph(graph, manager)
updated = preserve_cache(graph)
set_updated = set(updated)
manager.saved_cache.clear()
@@ -2437,14 +2457,6 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
manager.log("Processing SCC of size %d (%s) as %s" % (size, scc_str, fresh_msg))
process_stale_scc(graph, scc, manager)

# If we are running in fine-grained incremental mode with caching,
# we always process fresh SCCs so that we have all of the symbol
# tables and fine-grained dependencies available.
if manager.options.use_fine_grained_cache:
for prev_scc in fresh_scc_queue:
process_fresh_scc(graph, prev_scc, manager)
fresh_scc_queue = []

sccs_left = len(fresh_scc_queue)
nodes_left = sum(len(scc) for scc in fresh_scc_queue)
manager.add_stats(sccs_left=sccs_left, nodes_left=nodes_left)
@@ -2456,6 +2468,25 @@ def process_graph(graph: Graph, manager: BuildManager) -> None:
manager.log("No fresh SCCs left in queue")


def process_fine_grained_cache_graph(graph: Graph, manager: BuildManager) -> None:
"""Finish loading everything for use in the fine-grained incremental cache"""

# If we are running in fine-grained incremental mode with caching,
# we process all SCCs as fresh SCCs so that we have all of the symbol
# tables and fine-grained dependencies available.
# We fail the loading of any SCC that we can't load a meta for, so we
# don't have anything *but* fresh SCCs.
Collaborator:
Have you tested what happens if we fail loading some SCCs? Will they be picked up by the build later?

Author:
Yeah, that is tested by the testAddModuleAfterCache tests.
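
To illustrate the deferral behaviour discussed above: modules with no usable cache metadata are left out of the initial fine-grained cache load and are then picked up as changed modules by the first fine-grained update. The sketch below is a minimal toy model of that flow in plain Python, not mypy's actual API; the names initial_load, first_update, and cached are purely illustrative.

from typing import List, Set


def initial_load(sources: List[str], cached: Set[str]) -> Set[str]:
    # Analogous to load_graph during a fine-grained cache load: a module
    # with no cache meta is deferred (State.__init__ raises ModuleNotFound)
    # rather than parsed, so it simply does not appear in the graph.
    return {mod for mod in sources if mod in cached}


def first_update(sources: List[str], loaded: Set[str]) -> List[str]:
    # The first fine-grained update then treats every deferred module as changed.
    return [mod for mod in sources if mod not in loaded]


if __name__ == '__main__':
    cached = {'a', 'b'}                    # modules with cache metadata
    sources = ['a', 'b', 'c']              # 'c' was added after the cache was written
    loaded = initial_load(sources, cached)
    print(first_update(sources, loaded))   # ['c'] -- picked up by the update
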

sccs = sorted_components(graph)
manager.log("Found %d SCCs; largest has %d nodes" %
(len(sccs), max(len(scc) for scc in sccs)))

for ascc in sccs:
# Order the SCC's nodes using a heuristic.
# Note that ascc is a set, and scc is a list.
scc = order_ascc(graph, ascc)
process_fresh_scc(graph, scc, manager)


def order_ascc(graph: Graph, ascc: AbstractSet[str], pri_max: int = PRI_ALL) -> List[str]:
"""Come up with the ideal processing order within an SCC.

19 changes: 13 additions & 6 deletions mypy/dmypy_server.py
@@ -273,9 +273,6 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
self.fscache = FileSystemCache(self.options.python_version)
self.fswatcher = FileSystemWatcher(self.fscache)
self.update_sources(sources)
if not self.options.use_fine_grained_cache:
# Stores the initial state of sources as a side effect.
self.fswatcher.find_changed()
try:
result = mypy.build.build(sources=sources,
options=self.options,
@@ -292,12 +289,11 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
graph = result.graph
self.fine_grained_manager = FineGrainedBuildManager(manager, graph)
self.previous_sources = sources
self.fscache.flush()

# If we are using the fine-grained cache, build hasn't actually done
# the typechecking on the updated files yet.
# Run a fine-grained update starting from the cached data
if self.options.use_fine_grained_cache:
if result.used_cache:
# Pull times and hashes out of the saved_cache and stick them into
# the fswatcher, so we pick up the changes.
for state in self.fine_grained_manager.graph.values():
@@ -310,9 +306,20 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict

# Run an update
changed = self.find_changed(sources)

# Find anything that has had its dependency list change
for state in self.fine_grained_manager.graph.values():
if not state.is_fresh():
assert state.path is not None
changed.append((state.id, state.path))

if changed:
messages = self.fine_grained_manager.update(changed)
self.fscache.flush()
else:
# Stores the initial state of sources as a side effect.
self.fswatcher.find_changed()

self.fscache.flush()

status = 1 if messages else 0
self.previous_messages = messages[:]
3 changes: 1 addition & 2 deletions mypy/fixup.py
@@ -22,8 +22,7 @@ def fixup_module_pass_one(tree: MypyFile, modules: Dict[str, MypyFile],
node_fixer.visit_symbol_table(tree.names)


def fixup_module_pass_two(tree: MypyFile, modules: Dict[str, MypyFile],
quick_and_dirty: bool) -> None:
def fixup_module_pass_two(tree: MypyFile, modules: Dict[str, MypyFile]) -> None:
compute_all_mros(tree.names, modules)


4 changes: 3 additions & 1 deletion mypy/server/astdiff.py
@@ -136,7 +136,9 @@ def snapshot_symbol_table(name_prefix: str, table: SymbolTable) -> Dict[str, Sna
common = (fullname, symbol.kind, symbol.module_public)
if symbol.kind == MODULE_REF:
# This is a cross-reference to another module.
assert isinstance(node, MypyFile)
# If the reference is busted because the other module is missing,
# the node will be a "stale_info" TypeInfo produced by fixup,
# but that doesn't really matter to us here.
result[name] = ('Moduleref', common)
elif symbol.kind == TVAR:
assert isinstance(node, TypeVarExpr)
16 changes: 12 additions & 4 deletions mypy/server/update.py
@@ -171,13 +171,17 @@ def __init__(self,
# Module that we haven't processed yet but that are known to be stale.
self.stale = [] # type: List[Tuple[str, str]]
# Disable the cache so that load_graph doesn't try going back to disk
# for the cache. This is kind of a hack and it might be better to have
# this directly reflected in load_graph's interface.
self.options.cache_dir = os.devnull
# for the cache.
self.manager.cache_enabled = False
manager.saved_cache = {}
manager.only_load_from_cache = False

# Some hints to the test suite about what is going on:
# Active triggers during the last update
self.triggered = [] # type: List[str]
# Modules passed to update during the last update
self.changed_modules = [] # type: List[Tuple[str, str]]
# Modules processed during the last update
self.updated_modules = [] # type: List[str]

def update(self, changed_modules: List[Tuple[str, str]]) -> List[str]:
"""Update previous build result by processing changed modules.
@@ -198,10 +202,13 @@ def update(self, changed_modules: List[Tuple[str, str]]) -> List[str]:
"""
assert changed_modules, 'No changed modules'

self.changed_modules = changed_modules

# Reset global caches for the new build.
find_module_clear_caches()

self.triggered = []
self.updated_modules = []
changed_modules = dedupe_modules(changed_modules + self.stale)
initial_set = {id for id, _ in changed_modules}
self.manager.log_fine_grained('==== update %s ====' % ', '.join(
@@ -258,6 +265,7 @@ def update_single(self, module: str, path: str) -> Tuple[List[str],
- Whether there was a blocking error in the module
"""
self.manager.log_fine_grained('--- update single %r ---' % module)
self.updated_modules.append(module)

# TODO: If new module brings in other modules, we parse some files multiple times.
manager = self.manager
17 changes: 16 additions & 1 deletion mypy/test/helpers.py
@@ -5,7 +5,7 @@
import time
import shutil

from typing import List, Dict, Tuple, Callable, Any, Optional
from typing import List, Iterable, Dict, Tuple, Callable, Any, Optional

from mypy import defaults
from mypy.test.config import test_temp_dir
@@ -98,6 +98,21 @@ def assert_string_arrays_equal(expected: List[str], actual: List[str],
raise AssertionError(msg)


def assert_module_equivalence(name: str,
expected: Optional[Iterable[str]], actual: Iterable[str]) -> None:
if expected is not None:
expected_normalized = sorted(expected)
actual_normalized = sorted(set(actual).difference({"__main__"}))
assert_string_arrays_equal(
expected_normalized,
actual_normalized,
('Actual modules ({}) do not match expected modules ({}) '
'for "[{} ...]"').format(
', '.join(actual_normalized),
', '.join(expected_normalized),
name))


def update_testcase_output(testcase: DataDrivenTestCase, output: List[str]) -> None:
assert testcase.old_cwd is not None, "test was not properly set up"
testcase_path = os.path.join(testcase.old_cwd, testcase.file)
20 changes: 3 additions & 17 deletions mypy/test/testcheck.py
@@ -11,7 +11,7 @@
from mypy.test.config import test_temp_dir
from mypy.test.data import DataDrivenTestCase, DataSuite
from mypy.test.helpers import (
assert_string_arrays_equal, normalize_error_messages,
assert_string_arrays_equal, normalize_error_messages, assert_module_equivalence,
retry_on_error, update_testcase_output, parse_options,
copy_and_fudge_mtime
)
@@ -190,29 +190,15 @@ def run_case_once(self, testcase: DataDrivenTestCase, incremental_step: int = 0)
self.verify_cache(module_data, a, res.manager)
if incremental_step > 1:
suffix = '' if incremental_step == 2 else str(incremental_step - 1)
self.check_module_equivalence(
assert_module_equivalence(
'rechecked' + suffix,
testcase.expected_rechecked_modules.get(incremental_step - 1),
res.manager.rechecked_modules)
self.check_module_equivalence(
assert_module_equivalence(
'stale' + suffix,
testcase.expected_stale_modules.get(incremental_step - 1),
res.manager.stale_modules)

def check_module_equivalence(self, name: str,
expected: Optional[Set[str]], actual: Set[str]) -> None:
if expected is not None:
expected_normalized = sorted(expected)
actual_normalized = sorted(actual.difference({"__main__"}))
assert_string_arrays_equal(
expected_normalized,
actual_normalized,
('Actual modules ({}) do not match expected modules ({}) '
'for "[{} ...]"').format(
', '.join(actual_normalized),
', '.join(expected_normalized),
name))

def verify_cache(self, module_data: List[Tuple[str, str, str]], a: List[str],
manager: build.BuildManager) -> None:
# There should be valid cache metadata for each module except