Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Python extractor: in overlay mode, traverse only changed files
- Fall back to full extraction if the overlay-changes JSON file cannot be read or parsed.
- Filter both root modules and (transitive) imports against the overlay-changes JSON.
  • Loading branch information
d10c committed Oct 6, 2025
commit 49b18db0440f798b9b034e7201893bbb67065dba
42 changes: 36 additions & 6 deletions python/extractor/semmle/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from semmle.profiling import get_profiler
from semmle.path_rename import renamer_from_options_and_env
from semmle.logging import WARN, recursion_error_message, internal_error_message, Logger
from semmle.util import FileExtractable, FolderExtractable

class ExtractorFailure(Exception):
'Generic exception representing the failure of an extractor.'
Expand All @@ -19,17 +20,32 @@ class ExtractorFailure(Exception):

class ModuleImportGraph(object):

def __init__(self, max_depth):
def __init__(self, max_depth, logger: Logger):
    '''Create an empty import graph and, in overlay mode, load the changed-file filter.

    max_depth -- stored as `self.max_depth`; bounds how deep imports are followed.
    logger -- used here to report overlay mode and later by `add_todo`.

    If the environment variable CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES is set,
    it names a JSON file of the form {"changes": [<path>, ...]}. The listed
    paths are made absolute and stored in `self.overlay_changes`. If the file
    cannot be read, is not valid JSON, or does not have that shape, a warning
    is logged and `self.overlay_changes` stays None (full extraction).
    '''
    self.modules = {}             # module -> import depth (roots are 0)
    self.succ = defaultdict(set)  # module -> set of modules recorded as its successors
    self.todo = set()
    self.done = set()
    self.max_depth = max_depth
    self.logger = logger

    # During overlay extraction, only traverse the files that were changed.
    # None means "no filter": every module is eligible.
    self.overlay_changes = None
    overlay_changes_file = os.environ.get('CODEQL_EXTRACTOR_PYTHON_OVERLAY_CHANGES')
    if overlay_changes_file is not None:
        logger.info("Overlay extraction mode: only extracting files changed according to '%s'", overlay_changes_file)
        try:
            with open(overlay_changes_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # A file that parses but has the wrong shape must also trigger the
            # fallback instead of escaping as AttributeError/TypeError.
            if not isinstance(data, dict):
                raise ValueError("expected a JSON object at the top level")
            changed_paths = data.get('changes', [])
            if not isinstance(changed_paths, list) or not all(isinstance(p, str) for p in changed_paths):
                raise ValueError("'changes' must be a list of path strings")
            self.overlay_changes = { os.path.abspath(p) for p in changed_paths }
        except (IOError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            logger.warn("Failed to read overlay changes from '%s' (falling back to full extraction): %s", overlay_changes_file, e)
            self.overlay_changes = None

def add_root(self, mod):
self.modules[mod] = 0
if mod not in self.done:
self.todo.add(mod)
self.add_todo(mod)

def add_import(self, mod, imported):
assert mod in self.modules
Expand All @@ -39,7 +55,7 @@ def add_import(self, mod, imported):
self._reduce_depth(imported, self.modules[mod] + 1)
else:
if self.modules[mod] < self.max_depth and imported not in self.done:
self.todo.add(imported)
self.add_todo(imported)
self.modules[imported] = self.modules[mod] + 1

def _reduce_depth(self, mod, depth):
Expand All @@ -48,7 +64,7 @@ def _reduce_depth(self, mod, depth):
if depth > self.max_depth:
return
if mod not in self.done:
self.todo.add(mod)
self.add_todo(mod)
self.modules[mod] = depth
for imp in self.succ[mod]:
self._reduce_depth(imp, depth+1)
Expand All @@ -61,11 +77,25 @@ def get(self):

def push_back(self, mod):
self.done.remove(mod)
self.todo.add(mod)
self.add_todo(mod)

def empty(self):
    '''Return True when no modules are waiting in the todo set.'''
    if self.todo:
        return False
    return True

def add_todo(self, mod):
    '''Queue `mod` for processing unless the overlay filter rejects it.

    Rejected modules are not queued; a debug message is logged instead.
    '''
    if self._module_in_overlay_changes(mod):
        self.todo.add(mod)
    else:
        self.logger.debug("Skipping module '%s' as it was not changed in overlay extraction.", mod)

def _module_in_overlay_changes(self, mod):
    '''Return True if `mod` passes the overlay filter.

    With no filter (`self.overlay_changes` is None) every module passes.
    Otherwise a FileExtractable passes when its path is in the changed set,
    and a FolderExtractable passes when its `__init__.py` is in the changed
    set. Any other kind of module always passes.
    '''
    if self.overlay_changes is None:
        return True
    # The changed set holds os.path.abspath() results, so normalise the module
    # path the same way and join with the native separator; a hard-coded '/'
    # never matches on platforms whose separator differs.
    if isinstance(mod, FileExtractable):
        return os.path.abspath(mod.path) in self.overlay_changes
    if isinstance(mod, FolderExtractable):
        return os.path.abspath(os.path.join(mod.path, '__init__.py')) in self.overlay_changes
    return True

class ExtractorPool(object):
'''Pool of worker processes running extractors'''

Expand All @@ -90,7 +120,7 @@ def __init__(self, outdir, archive, proc_count, options, logger: Logger):
self.enqueued = set()
self.done = set()
self.requirements = {}
self.import_graph = ModuleImportGraph(options.max_import_depth)
self.import_graph = ModuleImportGraph(options.max_import_depth, logger)
logger.debug("Source archive: %s", archive)
self.logger = logger
DiagnosticsWriter.create_output_dir()
Expand Down