Skip to content

Commit

Permalink
feat: add rename statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
nielskrijger committed Feb 2, 2020
1 parent 345dc35 commit 96e24bd
Show file tree
Hide file tree
Showing 8 changed files with 308 additions and 63 deletions.
9 changes: 3 additions & 6 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,16 @@ var (
parser := internal.NewParser()
for i, arg := range os.Args {
if i > 0 {
fmt.Printf("\nstart processing %q\n", arg)
err := parser.ParseProject(arg)
check(err)
fmt.Printf("\nprocessing %q took %s\n", arg, time.Since(start).Round(time.Millisecond))
}
}
if len(os.Args) >= 3 {
fmt.Printf("\ntotal processing time was %s", time.Since(start).Round(time.Millisecond))
}
res, _ := json.Marshal(parser)
err := ioutil.WriteFile(outputFile, res, 0644)
fmt.Printf("\nwriting output to %q", outputFile)
err := ioutil.WriteFile(outputFile, res, 0644)
check(err)

fmt.Printf("\ntotal processing time was %s", time.Since(start).Round(time.Millisecond))
},
}
)
Expand Down
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ go 1.12
require (
github.com/gliderlabs/ssh v0.2.2 // indirect
github.com/google/go-cmp v0.3.0 // indirect
github.com/pkg/errors v0.8.1
github.com/spf13/cobra v0.0.5
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/testify v1.4.0 // indirect
Expand Down
50 changes: 17 additions & 33 deletions internal/commit.go
Original file line number Diff line number Diff line change
@@ -1,46 +1,30 @@
package internal

import (
"github.com/pkg/errors"
"gopkg.in/src-d/go-git.v4/plumbing/object"
"strings"
)

// Commit is the JSON-serialisable record of a single git commit: its hash,
// author and committer signatures, trimmed message, per-file changes and
// whether it is a merge commit.
type Commit struct {
Hash string `json:"hash"`
Author Signature `json:"author"`
Committer Signature `json:"committer"`
Message string `json:"message"`
Files []*FileChange `json:"files"`
IsMerge bool `json:"isMerge"`
Hash string `json:"hash"`
Author Signature `json:"author"`
Committer Signature `json:"committer"`
Message string `json:"message"`
FileChanges FileChanges `json:"files"`
IsMerge bool `json:"isMerge"`

// originalCommit keeps the go-git commit this record was built from so its
// tree and parents can still be read later. It is unexported and therefore
// never part of the JSON output.
originalCommit *object.Commit
}

func NewCommit(c *object.Commit) (*Commit, error) {
r := &Commit{
Hash: c.Hash.String(),
Author: NewSignature(c.Author),
Committer: NewSignature(c.Committer),
Message: strings.TrimSpace(c.Message),
Files: make([]*FileChange, 0),
IsMerge: len(c.ParentHashes) > 1,
}

// get file stats, these will be added later
stats, err := c.Stats()
if err != nil {
return nil, errors.Wrap(err, "failed to retrieve stats")
}
func NewCommit(c *object.Commit) *Commit {
return &Commit{
Hash: c.Hash.String(),
Author: NewSignature(c.Author),
Committer: NewSignature(c.Committer),
FileChanges: make(FileChanges, 0),
Message: strings.TrimSpace(c.Message),
IsMerge: len(c.ParentHashes) > 1,

// loop through all files and store metadata in a "FileChange"
// find stat for this file and add change
for _, stat := range stats {
fc := NewFileChange(stat.Name)

fc.Additions = stat.Addition
fc.Deletions = stat.Deletion

r.Files = append(r.Files, fc)
originalCommit: c,
}

return r, nil
}
58 changes: 58 additions & 0 deletions internal/commits.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package internal

import (
"errors"
"gopkg.in/src-d/go-git.v4/plumbing/object"
)

// Commits is a list of commits that can be searched by hash and whose file
// changes can be computed in bulk.
type Commits []*Commit

// ParseFileChanges computes the per-file changes of every commit in c and
// stores them in that commit's FileChanges field.
//
// Each commit is compared with its first parent only, the same convention
// go-git uses for its own patch statistics. A commit without parents is
// compared with an empty tree, so every file it contains is reported as newly
// added. Processing stops at the first error, which is returned unchanged.
func (c Commits) ParseFileChanges() error {
	for _, commit := range c {
		currentTree, err := commit.originalCommit.Tree()
		if err != nil {
			return err
		}

		parentTree := &object.Tree{}
		if commit.originalCommit.NumParents() > 0 {
			// Only compare with first parent, same as go-git's patch.Stats()
			firstParent, err := commit.originalCommit.Parents().Next()
			if err != nil {
				return err
			}
			if firstParent == nil {
				return errors.New("unable to find parent")
			}

			// Read the tree straight from the parent object. Looking the
			// parent up in c would dereference nil whenever the parent is not
			// part of this list, and costs a linear scan per commit.
			parentTree, err = firstParent.Tree()
			if err != nil {
				return err
			}
		}

		// Tree.Diff runs from its receiver to its argument, so the parent has
		// to be the receiver: lines introduced by this commit then show up as
		// additions and files it introduces have no "from" side.
		changes, err := parentTree.Diff(currentTree)
		if err != nil {
			return err
		}

		fcs, err := NewFileChanges(changes)
		if err != nil {
			return err
		}

		commit.FileChanges = fcs
	}
	return nil
}

// Find returns the first commit in c whose Hash equals hash, or nil when no
// commit matches.
func (c Commits) Find(hash string) *Commit {
	for i := 0; i < len(c); i++ {
		if candidate := c[i]; candidate.Hash == hash {
			return candidate
		}
	}
	return nil
}
50 changes: 43 additions & 7 deletions internal/file_change.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,50 @@
package internal

import (
"gopkg.in/src-d/go-git.v4/plumbing/format/diff"
"strings"
)

// FileChange describes the line-level changes made to one file and, when the
// file was recognised as part of a rename, the path it was renamed from or to.
type FileChange struct {
Name string `json:"filepath"`
IsBinary bool `json:"isBinary"`
Additions int `json:"additions"`
Deletions int `json:"deletions"`
Name string `json:"filepath"`
IsBinary bool `json:"isBinary"`
// Additions and Deletions start out as the raw line counts but are
// overwritten once the file is identified as one side of a rename.
Additions int `json:"additions"`
Deletions int `json:"deletions"`
// RawAdditions and RawDeletions keep the line counts taken from the patch
// and are not modified afterwards.
RawAdditions int `json:"rawAdditions"`
RawDeletions int `json:"rawDeletions"`
// RenameFrom is serialised as "renameOf"; RenameFrom, RenameTo and
// Similarity are omitted from the JSON output when they hold zero values.
RenameFrom string `json:"renameOf,omitempty"`
RenameTo string `json:"renameTo,omitempty"`
// Similarity is a percentage (100 for identical content).
Similarity float32 `json:"similarity,omitempty"`
}

func NewFileChange(path string) *FileChange {
return &FileChange{
Name: path,
// NewFileChange builds the FileChange for one file patch, recording whether
// the patch is binary and how many lines its chunks add and delete.
//
// Based heavily on:
// https://github.com/src-d/go-git/blob/d6c4b113c17a011530e93f179b7ac27eb3f17b9b/plumbing/object/patch.go
func NewFileChange(name string, fp diff.FilePatch) *FileChange {
	change := &FileChange{Name: name, IsBinary: fp.IsBinary()}

	for _, chunk := range fp.Chunks() {
		content := chunk.Content()
		if content == "" {
			continue
		}

		// Every newline ends a line; a final line without a trailing newline
		// still counts as one.
		lines := strings.Count(content, "\n")
		if !strings.HasSuffix(content, "\n") {
			lines++
		}

		switch chunk.Type() {
		case diff.Add:
			change.Additions += lines
		case diff.Delete:
			change.Deletions += lines
		}
	}

	// Additions & Deletions are mutable, Raw* are not after this
	change.RawAdditions = change.Additions
	change.RawDeletions = change.Deletions

	return change
}
161 changes: 161 additions & 0 deletions internal/file_changes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package internal

import (
"gopkg.in/src-d/go-git.v4/plumbing/object"
"strings"
)

// FileChanges is the list of per-file changes belonging to a single diff.
type FileChanges []*FileChange

const (
// RenameThreshold is the similarity percentage a deleted file and an added
// file must exceed (strictly) to be reported as a rename. Similarity is the
// number of lines the two files have in common divided by the line count of
// the longer of the two.
RenameThreshold = 40
)

// changedFile is an added or deleted file that is a candidate for rename
// detection.
type changedFile struct {
// filepath is the path of the file in the tree it exists in.
filepath string
// content is the text of the first chunk of the file's patch.
content string
}

// NewFileChanges turns the go-git changes of one diff into FileChanges, with
// one entry per file patch that contains at least one chunk.
//
// Files that exist on only one side of a patch are collected as added or
// deleted files and handed to findRenames; every detected rename is then
// written back onto the entries of both the old and the new path. The first
// error returned while building a patch aborts the conversion.
func NewFileChanges(changes object.Changes) (FileChanges, error) {
	var (
		added   = make([]changedFile, 0)
		removed = make([]changedFile, 0)
		result  = make(FileChanges, 0)
	)

	// Extracts raw addition/deletion stats and looks for any new/deleted files
	for _, change := range changes {
		patch, err := change.Patch()
		if err != nil {
			return nil, err
		}

		for _, filePatch := range patch.FilePatches() {
			// ignore empty patches (binary files, submodule refs updates)
			if len(filePatch.Chunks()) == 0 {
				continue
			}

			var name string
			from, to := filePatch.Files()
			switch {
			case from == nil:
				// No source side: the file was added.
				name = to.Path()
				added = append(added, changedFile{
					filepath: to.Path(),
					content:  filePatch.Chunks()[0].Content(),
				})
			case to == nil:
				// No destination side: the file was deleted.
				name = from.Path()
				removed = append(removed, changedFile{
					filepath: from.Path(),
					content:  filePatch.Chunks()[0].Content(),
				})
			default:
				name = from.Path()
			}

			result = append(result, NewFileChange(name, filePatch))
		}
	}

	for _, rename := range findRenames(added, removed) {
		for _, fc := range result {
			switch fc.Name {
			case rename.RenameFrom:
				// The old path: its lines moved rather than disappeared.
				fc.Additions = 0
				fc.Deletions = 0
				fc.RenameTo = rename.RenameTo
				fc.Similarity = rename.Similarity
			case rename.Name:
				// The new path: only the lines that differ count as changes.
				fc.Additions = rename.Additions
				fc.Deletions = rename.Deletions
				fc.RenameFrom = rename.RenameFrom
				fc.Similarity = rename.Similarity
			}
		}
	}

	return result, nil
}

// findRenames pairs deleted files with added files that appear to be the same
// file under a new path and returns one FileChange per detected rename.
//
// Matching happens in three passes:
//
//  1. Files with identical content are paired first (similarity 100). This
//     also keeps them out of the more expensive comparison below.
//  2. Every remaining deleted/added combination is scored by the number of
//     lines the two files share, relative to the longer file. Combinations
//     scoring above RenameThreshold become candidates.
//  3. Candidates are accepted best score first, and every file takes part in
//     at most one rename. A deleted file that loses its best match to a
//     better-scoring competitor falls back to its next best candidate.
//
// For a returned rename, Name and RenameTo hold the new path, RenameFrom the
// old path, and Additions/Deletions the number of lines that exist on only
// one side. Neither input slice is modified.
func findRenames(newFiles, deletedFiles []changedFile) FileChanges {
	renames := make(FileChanges, 0)

	// Work on a private copy so that dropping matched entries never disturbs
	// the caller's slice.
	available := make([]changedFile, len(newFiles))
	copy(available, newFiles)

	// Pass 1: identical content.
	unmatched := make([]changedFile, 0, len(deletedFiles))
OUTER:
	for _, deletedFile := range deletedFiles {
		for i, newFile := range available {
			if deletedFile.content != newFile.content {
				continue
			}

			// Remove the file so it can't be claimed a second time.
			available = append(available[:i], available[i+1:]...)

			renames = append(renames, &FileChange{
				Name:       newFile.filepath,
				IsBinary:   false,
				Additions:  0,
				Deletions:  0,
				RenameTo:   newFile.filepath,
				RenameFrom: deletedFile.filepath,
				Similarity: 100.0,
			})
			continue OUTER
		}
		unmatched = append(unmatched, deletedFile)
	}

	// Pass 2: score every remaining combination.
	type pairing struct {
		deleted, added int // indexes into unmatched and available
		change         *FileChange
	}
	pairings := make([]pairing, 0)

	// Split each added file once rather than once per deleted file.
	addedLines := make([][]string, len(available))
	for j, newFile := range available {
		addedLines[j] = splitLines(newFile.content)
	}

	for i, deletedFile := range unmatched {
		deletedLines := splitLines(deletedFile.content)
		deletedCounts := make(map[string]int, len(deletedLines))
		for _, line := range deletedLines {
			deletedCounts[line]++
		}

		for j, newFile := range available {
			// Shared lines are counted as a multiset intersection: a line
			// that occurs k times in the deleted file can be matched by at
			// most k occurrences in the added file.
			matched := make(map[string]int)
			similarLines := 0
			for _, line := range addedLines[j] {
				if matched[line] < deletedCounts[line] {
					matched[line]++
					similarLines++
				}
			}

			// splitLines always yields at least one element, so the divisor
			// is never zero.
			longest := maxInt(len(addedLines[j]), len(deletedLines))
			similarity := float32(similarLines) / float32(longest) * 100.0
			if similarity <= RenameThreshold {
				continue
			}

			pairings = append(pairings, pairing{
				deleted: i,
				added:   j,
				change: &FileChange{
					Name:       newFile.filepath,
					IsBinary:   false,
					Additions:  len(addedLines[j]) - similarLines,
					Deletions:  len(deletedLines) - similarLines,
					RenameTo:   newFile.filepath,
					RenameFrom: deletedFile.filepath,
					Similarity: similarity,
				},
			})
		}
	}

	// Pass 3: accept candidates from most to least similar. On equal scores
	// the earlier deleted file, then the earlier added file, wins.
	deletedClaimed := make([]bool, len(unmatched))
	addedClaimed := make([]bool, len(available))
	for {
		best := -1
		for k, p := range pairings {
			if deletedClaimed[p.deleted] || addedClaimed[p.added] {
				continue
			}
			if best < 0 || p.change.Similarity > pairings[best].change.Similarity {
				best = k
			}
		}
		if best < 0 {
			break
		}

		chosen := pairings[best]
		deletedClaimed[chosen.deleted] = true
		addedClaimed[chosen.added] = true
		renames = append(renames, chosen.change)
	}

	return renames
}

// maxInt returns the larger of x and y.
func maxInt(x, y int) int {
	if x >= y {
		return x
	}
	return y
}

// splitLines splits content into lines, treating both "\r\n" and "\n" as line
// endings. The result always has at least one element; content that ends with
// a line ending yields a trailing empty string.
func splitLines(content string) []string {
	normalized := strings.ReplaceAll(content, "\r\n", "\n")
	return strings.Split(normalized, "\n")
}
Loading

0 comments on commit 96e24bd

Please sign in to comment.