From 96e24bd6fd10a33ebf02e7d3e12a73d5f69da05d Mon Sep 17 00:00:00 2001 From: Niels Krijger Date: Sun, 2 Feb 2020 20:25:07 +0100 Subject: [PATCH] feat: add rename statistics --- cmd/root.go | 9 +-- go.mod | 1 - internal/commit.go | 50 +++++------- internal/commits.go | 58 ++++++++++++++ internal/file_change.go | 50 ++++++++++-- internal/file_changes.go | 161 +++++++++++++++++++++++++++++++++++++++ internal/parser.go | 19 ++++- internal/project.go | 23 +++--- 8 files changed, 308 insertions(+), 63 deletions(-) create mode 100644 internal/commits.go create mode 100644 internal/file_changes.go diff --git a/cmd/root.go b/cmd/root.go index e504465..cbacfa3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -23,19 +23,16 @@ var ( parser := internal.NewParser() for i, arg := range os.Args { if i > 0 { - fmt.Printf("\nstart processing %q\n", arg) err := parser.ParseProject(arg) check(err) - fmt.Printf("\nprocessing %q took %s\n", arg, time.Since(start).Round(time.Millisecond)) } } - if len(os.Args) >= 3 { - fmt.Printf("\ntotal processing time was %s", time.Since(start).Round(time.Millisecond)) - } res, _ := json.Marshal(parser) - err := ioutil.WriteFile(outputFile, res, 0644) fmt.Printf("\nwriting output to %q", outputFile) + err := ioutil.WriteFile(outputFile, res, 0644) check(err) + + fmt.Printf("\ntotal processing time was %s", time.Since(start).Round(time.Millisecond)) }, } ) diff --git a/go.mod b/go.mod index bb15d91..8fc3039 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,6 @@ go 1.12 require ( github.com/gliderlabs/ssh v0.2.2 // indirect github.com/google/go-cmp v0.3.0 // indirect - github.com/pkg/errors v0.8.1 github.com/spf13/cobra v0.0.5 github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/testify v1.4.0 // indirect diff --git a/internal/commit.go b/internal/commit.go index 9ae745c..a36367f 100644 --- a/internal/commit.go +++ b/internal/commit.go @@ -1,46 +1,30 @@ package internal import ( - "github.com/pkg/errors" "gopkg.in/src-d/go-git.v4/plumbing/object" 
"strings" ) type Commit struct { - Hash string `json:"hash"` - Author Signature `json:"author"` - Committer Signature `json:"committer"` - Message string `json:"message"` - Files []*FileChange `json:"files"` - IsMerge bool `json:"isMerge"` + Hash string `json:"hash"` + Author Signature `json:"author"` + Committer Signature `json:"committer"` + Message string `json:"message"` + FileChanges FileChanges `json:"files"` + IsMerge bool `json:"isMerge"` + + originalCommit *object.Commit } -func NewCommit(c *object.Commit) (*Commit, error) { - r := &Commit{ - Hash: c.Hash.String(), - Author: NewSignature(c.Author), - Committer: NewSignature(c.Committer), - Message: strings.TrimSpace(c.Message), - Files: make([]*FileChange, 0), - IsMerge: len(c.ParentHashes) > 1, - } - - // get file stats, these will be added later - stats, err := c.Stats() - if err != nil { - return nil, errors.Wrap(err, "failed to retrieve stats") - } +func NewCommit(c *object.Commit) *Commit { + return &Commit{ + Hash: c.Hash.String(), + Author: NewSignature(c.Author), + Committer: NewSignature(c.Committer), + FileChanges: make(FileChanges, 0), + Message: strings.TrimSpace(c.Message), + IsMerge: len(c.ParentHashes) > 1, - // loop through all files and store metadata in a "FileChange" - // find stat for this file and add change - for _, stat := range stats { - fc := NewFileChange(stat.Name) - - fc.Additions = stat.Addition - fc.Deletions = stat.Deletion - - r.Files = append(r.Files, fc) + originalCommit: c, } - - return r, nil } diff --git a/internal/commits.go b/internal/commits.go new file mode 100644 index 0000000..3c2414f --- /dev/null +++ b/internal/commits.go @@ -0,0 +1,58 @@ +package internal + +import ( + "errors" + "gopkg.in/src-d/go-git.v4/plumbing/object" +) + +type Commits []*Commit + +func (c Commits) ParseFileChanges() error { + for _, commit := range c { + currentTree, err := commit.originalCommit.Tree() + if err != nil { + return err + } + + toTree := &object.Tree{} + if 
commit.originalCommit.NumParents() > 0 { + // Only compare with first parent, same as go-git's patch.Stats() + firstParent, err := commit.originalCommit.Parents().Next() + if err != nil { + return err + } + if firstParent == nil { + return errors.New("unable to find parent") + } + + parentCommit := c.Find(firstParent.Hash.String()) + + toTree, err = parentCommit.originalCommit.Tree() + if err != nil { + return err + } + } + + changes, err := currentTree.Diff(toTree) + if err != nil { + return err + } + + fcs, err := NewFileChanges(changes) + if err != nil { + return err + } + + commit.FileChanges = fcs + } + return nil +} + +func (c Commits) Find(hash string) *Commit { + for _, commit := range c { + if commit.Hash == hash { + return commit + } + } + return nil +} diff --git a/internal/file_change.go b/internal/file_change.go index f0c2ff1..6868eac 100644 --- a/internal/file_change.go +++ b/internal/file_change.go @@ -1,14 +1,50 @@ package internal +import ( + "gopkg.in/src-d/go-git.v4/plumbing/format/diff" + "strings" +) + type FileChange struct { - Name string `json:"filepath"` - IsBinary bool `json:"isBinary"` - Additions int `json:"additions"` - Deletions int `json:"deletions"` + Name string `json:"filepath"` + IsBinary bool `json:"isBinary"` + Additions int `json:"additions"` + Deletions int `json:"deletions"` + RawAdditions int `json:"rawAdditions"` + RawDeletions int `json:"rawDeletions"` + RenameFrom string `json:"renameOf,omitempty"` + RenameTo string `json:"renameTo,omitempty"` + Similarity float32 `json:"similarity,omitempty"` } -func NewFileChange(path string) *FileChange { - return &FileChange{ - Name: path, +// Based heavily on: +// https://github.com/src-d/go-git/blob/d6c4b113c17a011530e93f179b7ac27eb3f17b9b/plumbing/object/patch.go +func NewFileChange(name string, fp diff.FilePatch) *FileChange { + fc := &FileChange{Name: name, IsBinary: fp.IsBinary()} + + for _, chunk := range fp.Chunks() { + s := chunk.Content() + if len(s) == 0 { + continue + } + 
+ switch chunk.Type() { + case diff.Add: + fc.Additions += strings.Count(s, "\n") + if s[len(s)-1] != '\n' { + fc.Additions++ + } + case diff.Delete: + fc.Deletions += strings.Count(s, "\n") + if s[len(s)-1] != '\n' { + fc.Deletions++ + } + } } + + // Additions & Deletions are mutable, Raw* are not after this + fc.RawAdditions = fc.Additions + fc.RawDeletions = fc.Deletions + + return fc } diff --git a/internal/file_changes.go b/internal/file_changes.go new file mode 100644 index 0000000..48939a3 --- /dev/null +++ b/internal/file_changes.go @@ -0,0 +1,161 @@ +package internal + +import ( + "gopkg.in/src-d/go-git.v4/plumbing/object" + "strings" +) + +type FileChanges []*FileChange + +const ( + // RenameThreshold specifies the percentage of removed lines that + // still exist in destination to consider them linked. + RenameThreshold = 40 +) + +type changedFile struct { + filepath string + content string +} + +func NewFileChanges(changes object.Changes) (FileChanges, error) { + newFiles := make([]changedFile, 0) + deletedFiles := make([]changedFile, 0) + + fcs := make(FileChanges, 0) + + // Extracts raw addition/deletion stats and looks for any new/deleted files + for _, c := range changes { + patch, err := c.Patch() + if err != nil { + return nil, err + } + + for _, fp := range patch.FilePatches() { + // ignore empty patches (binary files, submodule refs updates) + if len(fp.Chunks()) == 0 { + continue + } + + from, to := fp.Files() + name := "" + if from == nil { + name = to.Path() + newFiles = append(newFiles, changedFile{ + filepath: to.Path(), + content: fp.Chunks()[0].Content(), + }) + } else if to == nil { + name = from.Path() + deletedFiles = append(deletedFiles, changedFile{ + filepath: from.Path(), + content: fp.Chunks()[0].Content(), + }) + } else { + name = from.Path() + } + fcs = append(fcs, NewFileChange(name, fp)) + } + } + + renames := findRenames(newFiles, deletedFiles) + for _, rename := range renames { + for _, file := range fcs { + if file.Name == 
rename.RenameFrom { + file.Additions = 0 + file.Deletions = 0 + file.RenameTo = rename.RenameTo + file.Similarity = rename.Similarity + } else if file.Name == rename.Name { + file.Additions = rename.Additions + file.Deletions = rename.Deletions + file.RenameFrom = rename.RenameFrom + file.Similarity = rename.Similarity + } + } + } + + return fcs, nil +} + +func findRenames(newFiles, deletedFiles []changedFile) FileChanges { + renames := make(FileChanges, 0) + +OUTER: + for _, deletedFile := range deletedFiles { + // First try to find identical matches. This efficiently limits the + // number of comparisons later. + for i, newFile := range newFiles { + if deletedFile.content == newFile.content { + + // Delete file from array so we don't process it twice + length := len(newFiles) + newFiles[length-1], newFiles[i] = newFiles[i], newFiles[length-1] + newFiles = newFiles[:length-1] + + renames = append(renames, &FileChange{ + Name: newFile.filepath, + IsBinary: false, + Additions: 0, + Deletions: 0, + RenameTo: newFile.filepath, + RenameFrom: deletedFile.filepath, + Similarity: 100.0, + }) + continue OUTER + } + } + + // Otherwise start comparing all lines + deletedLines := splitLines(deletedFile.content) + deletedLinesCount := len(deletedLines) + var highestMatch *FileChange + for _, newFile := range newFiles { + similarLines := 0 + addedLines := splitLines(newFile.content) + addedLinesCount := len(addedLines) + for _, deletedLine := range deletedLines { + for i, addedLine := range addedLines { + if addedLine == deletedLine { + addedLines = append(addedLines[:i], addedLines[i+1:]...) + similarLines += 1 + break + } + } + } + + similarity := float32(similarLines) / float32(maxInt(addedLinesCount, deletedLinesCount)) * 100.0 + + if similarity > RenameThreshold && (highestMatch == nil || similarity > highestMatch.Similarity) { + // TODO check if newFile is already being used in another rename. + // If so compare similarity %.
When higher it's the new match and + // re-queue the deleted file. When lower skip. + highestMatch = &FileChange{ + Name: newFile.filepath, + IsBinary: false, + Additions: addedLinesCount - similarLines, + Deletions: deletedLinesCount - similarLines, + RenameTo: newFile.filepath, + RenameFrom: deletedFile.filepath, + Similarity: similarity, + } + } + } + + if highestMatch != nil { + renames = append(renames, highestMatch) + } + } + return renames +} + +func maxInt(x, y int) int { + if x < y { + return y + } + return x +} + +func splitLines(content string) []string { + return strings.Split(strings.Replace(content, "\r\n", "\n", -1), "\n") +} diff --git a/internal/parser.go b/internal/parser.go index 36078c8..73995be 100644 --- a/internal/parser.go +++ b/internal/parser.go @@ -1,28 +1,39 @@ package internal +import ( + "fmt" + "time" +) + type Parser struct { - Version string `json:"version"` - Projects []*Project `json:"projects"` + Version string `json:"version"` + Projects []*Project `json:"projects"` } func NewParser() *Parser { return &Parser{ - Version: "0.1.0", + Version: "1.0.0", Projects: make([]*Project, 0), } } func (p *Parser) ParseProject(filepath string) error { + + fmt.Printf("processing %q\n", filepath) project, err := NewProject(filepath) if err != nil { return err } + start := time.Now() + fmt.Print("parsing commits...") err = project.ParseCommits() if err != nil { return err } + fmt.Printf(" done (%v)\n", time.Since(start).Round(time.Millisecond)) p.Projects = append(p.Projects, project) + return nil -} \ No newline at end of file +} diff --git a/internal/project.go b/internal/project.go index fa15a7a..d369481 100644 --- a/internal/project.go +++ b/internal/project.go @@ -1,7 +1,6 @@ package internal import ( - "fmt" "gopkg.in/src-d/go-billy.v4/osfs" "gopkg.in/src-d/go-git.v4" "gopkg.in/src-d/go-git.v4/plumbing/cache" @@ -12,8 +11,8 @@ import ( ) type Project struct { - Name string `json:"name"` - Commits []*Commit `json:"commits"` + Name string 
`json:"name"` + Commits Commits `json:"commits"` filepath string } @@ -25,8 +24,8 @@ func NewProject(path string) (*Project, error) { } return &Project{ - Name: name, - Commits: make([]*Commit, 0), + Name: name, + Commits: make(Commits, 0), filepath: path, }, nil } @@ -62,15 +61,15 @@ func (p *Project) ParseCommits() error { } // ... iterate over the commits - return cIter.ForEach(func(c *object.Commit) error { - fmt.Print(".") - commit, err := NewCommit(c) - if err != nil { - return err - } - p.Commits = append(p.Commits, commit) + err = cIter.ForEach(func(c *object.Commit) error { + p.Commits = append(p.Commits, NewCommit(c)) return nil }) + if err != nil { + return err + } + + return p.Commits.ParseFileChanges() } func projectName(fp string) (string, error) {