39
39
from typing import Tuple
40
40
import xml .etree .ElementTree as ET
41
41
42
- import pygit2
42
+ from git import Repo , DiffIndex
43
43
from packageurl import PackageURL
44
44
45
45
from vulnerabilities .oval_parser import OvalParser
@@ -319,35 +319,41 @@ def _collect_file_changes(
319
319
file_ext : Optional [str ],
320
320
) -> Tuple [Set [str ], Set [str ]]:
321
321
322
- previous_commit = None
323
322
added_files , updated_files = set (), set ()
324
323
325
- for commit in self . _repo . walk ( self . _repo . head . target , pygit2 . GIT_SORT_TIME ):
326
- commit_time = commit . commit_time + commit . commit_time_offset # convert to UTC
327
-
328
- if commit_time < self .cutoff_timestamp :
324
+ # find the most ancient commit we need to diff with
325
+ cutoff_commit = None
326
+ for commit in self . _repo . iter_commits ( self . _repo . head ):
327
+ if commit . committed_date < self .cutoff_timestamp :
329
328
break
329
+ cutoff_commit = commit
330
330
331
- if previous_commit is None :
332
- previous_commit = commit
333
- continue
331
+ if cutoff_commit is None :
332
+ return added_files , updated_files
334
333
335
- for d in commit .tree .diff_to_tree (previous_commit .tree ).deltas :
336
- if not _include_file (d .new_file .path , subdir , recursive , file_ext ) or d .is_binary :
337
- continue
334
+ def _is_binary (d : DiffIndex ):
335
+ if not d .b_blob :
336
+ return False
337
+ try :
338
+ d .b_blob .data_stream .read ().decode ()
339
+ except UnicodeDecodeError :
340
+ return True
341
+ return False
342
+
343
+ for d in cutoff_commit .diff (self ._repo .head .commit ):
344
+ if not _include_file (d .b_path , subdir , recursive , file_ext ) or _is_binary (d ):
345
+ continue
338
346
339
- abspath = os .path .join (self .config .working_directory , d .new_file . path )
340
- # TODO
341
- # Just filtering on the two status values for "added" and "modified" is too
342
- # simplistic. This does not cover file renames, copies &
343
- # deletions.
344
- if d . status == pygit2 . GIT_DELTA_ADDED :
347
+ abspath = os .path .join (self .config .working_directory , d .b_path )
348
+ if d . new_file :
349
+ added_files . add ( abspath )
350
+ elif d . a_blob and d . b_blob :
351
+ if d . a_path != d . b_path :
352
+ # consider moved files as added
345
353
added_files .add (abspath )
346
- elif d .status == pygit2 . GIT_DELTA_MODIFIED :
354
+ elif d .a_blob != d . b_blob :
347
355
updated_files .add (abspath )
348
356
349
- previous_commit = commit
350
-
351
357
# Any file that has been added and then updated inside the window of the git history we
352
358
# looked at, should be considered "added", not "updated", since it does not exist in the
353
359
# database yet.
@@ -364,29 +370,26 @@ def _ensure_working_directory(self) -> None:
364
370
os .mkdir (self .config .working_directory )
365
371
366
372
def _ensure_repository(self) -> None:
    """Make ``self._repo`` a checkout of the configured branch, synced with the remote.

    When the working directory contains no ``.git`` entry, the repository is
    cloned via ``self._clone_repository()`` and nothing else is done.
    Otherwise the existing checkout is opened, HEAD is pointed at the
    configured branch (``self.config.branch`` defaults to the currently
    active branch when it is ``None``), the index and working tree are reset
    to it, and the branch is updated from the remote.
    """
    git_dir = os.path.join(self.config.working_directory, ".git")
    if not os.path.exists(git_dir):
        self._clone_repository()
        return

    self._repo = Repo(self.config.working_directory)

    if self.config.branch is None:
        self.config.branch = str(self._repo.active_branch)
    branch = self.config.branch

    # Point HEAD at the wanted branch, then reset index and working tree so
    # the files on disk match that branch before pulling in remote changes.
    self._repo.head.reference = self._repo.heads[branch]
    self._repo.head.reset(index=True, working_tree=True)

    remote = self._find_or_add_remote()
    self._update_from_remote(remote, branch)
383
386
384
387
def _clone_repository(self) -> None:
    """Clone ``self.config.repository_url`` into the working directory.

    The resulting repository object is stored on ``self._repo``. When
    ``self.config.branch`` is set, it is forwarded to the clone as the
    ``branch`` option; otherwise no branch option is passed.
    """
    kwargs = {}
    if self.config.branch:
        kwargs["branch"] = self.config.branch

    self._repo = Repo.clone_from(
        self.config.repository_url, self.config.working_directory, **kwargs
    )
392
395
@@ -398,20 +401,19 @@ def _find_or_add_remote(self):
398
401
break
399
402
400
403
if remote is None :
401
- remote = self ._repo .remotes . create (
402
- "added_by_vulnerablecode" , self .config .repository_url
404
+ remote = self ._repo .create_remote (
405
+ "added_by_vulnerablecode" , url = self .config .repository_url
403
406
)
404
407
405
408
return remote
406
409
407
410
def _update_from_remote(self, remote, branch) -> None:
    """Fetch from ``remote`` and move the local ``branch`` to the fetched state.

    :param remote: the remote to fetch from; its ``refs`` are looked up by
        the local branch's name.
    :param branch: name of the local branch to update.

    Returns early, leaving the checkout untouched, when the fetch reports
    nothing. Otherwise the local branch is moved to the commit of the
    matching remote ref, and the index and working tree are reset to HEAD.
    """
    fetch_info = remote.fetch()
    if not fetch_info:
        return

    # Keep the ``branch`` argument (a name) distinct from the head object.
    local_branch = self._repo.branches[branch]
    remote_ref = remote.refs[local_branch.name]

    # Move the branch to the remote's commit instead of re-pointing it at the
    # remote ref itself, so the local branch stays an ordinary branch.
    # NOTE(review): assumes GitPython's ``set_reference`` would turn the
    # branch into a symbolic ref when given a reference - confirm.
    local_branch.set_commit(remote_ref.commit)
    self._repo.head.reset(index=True, working_tree=True)
415
417
416
418
417
419
def _include_file (
0 commit comments