Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,9 @@ dist/metascan: $(wildcard metascan/*.go) $(wildcard metascan/internal/*.go) $(wi
@echo "Building metascan"

# Aggregate target: depends on both dist/runtime-packages and dist/metascan.
dist: dist/runtime-packages dist/metascan

# Creates a Python 3.10 virtual environment in .venv; the target is the
# interpreter inside it, so the rule only runs while that file is missing.
.venv/bin/python:
python3.10 -m venv .venv

# Installs the project from the current directory into the virtual
# environment, creating the environment first when needed.
dev: .venv/bin/python
.venv/bin/python -m pip install .
44 changes: 40 additions & 4 deletions go-libs/git/maintainers.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,14 +118,13 @@ func (all Commits) Ended() time.Time {
return started
}

func (all Commits) Filter(prefix string) (out Commits) {
// this is a straightforward code ownership detection strategy,
// though we can go berserk with prefix tree for paths
// Filter reduces the history based on the predicate from path
func (all Commits) Filter(predicate func(pathname string) bool) (out Commits) {
for _, c := range all {
stats := []NumStat{}
for _, ns := range c.Stats {
// we don't handle path renames
if !strings.HasPrefix(ns.Pathname, prefix) {
if !predicate(ns.Pathname) {
continue
}
stats = append(stats, ns)
Expand All @@ -144,6 +143,43 @@ func (all Commits) Filter(prefix string) (out Commits) {
return out
}

// LanguageStats tallies the number of changed lines (added plus deleted) per
// file extension across every commit of the history.
//
// The key is the text after the last dot of the file's base name, so a dotted
// directory name (as in "v1.2/Makefile") is never mistaken for an extension.
// Files without any dot are keyed by their base name, e.g. "Makefile".
// Path renames are not handled. The returned map is never nil.
func (all Commits) LanguageStats() map[string]int {
	stats := map[string]int{}
	for _, c := range all {
		for _, ns := range c.Stats {
			// only the last path element can carry an extension
			name := ns.Pathname[strings.LastIndex(ns.Pathname, "/")+1:]
			// LastIndex yields -1 when there is no dot, which keeps the whole name
			ext := name[strings.LastIndex(name, ".")+1:]
			stats[ext] += ns.Added + ns.Deleted
		}
	}
	return stats
}

// Language returns the key of LanguageStats with the highest number of
// changed lines, or "unknown" when the history has no file statistics.
//
// When several entries share the highest count, the alphabetically first one
// wins, so the result does not depend on map iteration order.
func (all Commits) Language() string {
	type lang struct {
		Ext     string
		Changes int
	}
	stats := all.LanguageStats()
	if len(stats) == 0 {
		return "unknown"
	}
	out := make([]lang, 0, len(stats))
	for k, v := range stats {
		out = append(out, lang{
			Ext:     k,
			Changes: v,
		})
	}
	sort.Slice(out, func(i, j int) bool {
		if out[i].Changes != out[j].Changes {
			return out[i].Changes > out[j].Changes
		}
		// map iteration order is random and sort.Slice is not stable:
		// break ties by name to keep the answer the same between runs
		return out[i].Ext < out[j].Ext
	})
	return out[0].Ext
}

func (all Commits) Authors() (out Authors) {
type tmp struct {
Author, Email string
Expand Down
136 changes: 136 additions & 0 deletions metascan/clone/clone.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package clone

import (
"context"
"fmt"
"path"
"sort"
"strings"

"github.com/databrickslabs/sandbox/go-libs/fileset"
"github.com/databrickslabs/sandbox/go-libs/git"
"github.com/databrickslabs/sandbox/go-libs/github"
"github.com/databrickslabs/sandbox/metascan/inventory"
"github.com/databrickslabs/sandbox/metascan/metadata"
"github.com/yuin/goldmark"
meta "github.com/yuin/goldmark-meta"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
)

// Clones is a list of repository clones whose metadata can be collected
// in one go through Metadatas.
type Clones []*Clone

// Metadatas gathers the metadata of every clone into a single slice, ordered
// by LastUpdated with the most recent entry first. The first clone that fails
// aborts the collection and its error is returned unchanged.
func (cc Clones) Metadatas(ctx context.Context) (out []metadata.Metadata, err error) {
	for _, clone := range cc {
		found, err := clone.Metadatas(ctx)
		if err != nil {
			return nil, err
		}
		out = append(out, found...)
	}
	// entry i goes before entry j when j was updated earlier than i
	newestFirst := func(i, j int) bool {
		return out[j].LastUpdated.Before(out[i].LastUpdated)
	}
	sort.Slice(out, newestFirst)
	return out, nil
}

// Clone bundles what is known about a single checked-out repository.
type Clone struct {
// Inventory is the inventory entry of the repository; the methods of Clone
// read its Org, IsSandbox and Maturity fields.
Inventory inventory.Item
// Git is the checkout that the commit history is read from.
Git *git.Checkout
// Repo is the repository record: name, description, topics, language and URL.
Repo github.Repo
// FileSet is the set of files of the checkout; it is used to find README.md
// files and the last update time.
FileSet fileset.FileSet
}

// Name returns the identifier of the clone in "org/repo" form, combining the
// organization of the inventory entry with the name of the repository.
func (c *Clone) Name() string {
	org, repo := c.Inventory.Org, c.Repo.Name
	return fmt.Sprintf("%s/%s", org, repo)
}

// Metadatas describes the projects found in this clone.
//
// For a sandbox repository (Inventory.IsSandbox) every folder below the root
// that holds a README.md is reported as a separate project: the title is the
// text of the first node of the README, while author, language and dates are
// derived from the git history of the non-markdown files of that folder.
// Folders with an empty title or without any author are left out.
//
// Any other repository yields exactly one entry built from the repository
// record.
//
// An error is returned when the git history or a README cannot be read.
func (c *Clone) Metadatas(ctx context.Context) ([]metadata.Metadata, error) {
	if !c.Inventory.IsSandbox {
		// NOTE(review): LastUpdated is not set here, although
		// Clones.Metadatas sorts by it - confirm this is intended.
		return []metadata.Metadata{{
			Title:    c.Repo.Description,
			Tags:     c.Repo.Topics,
			Language: c.Repo.Langauge,
			Date:     c.FileSet.LastUpdated(),
			Maturity: c.Inventory.Maturity,
			URL:      c.Repo.HtmlURL,
		}}, nil
	}
	history, err := c.Git.History(ctx)
	if err != nil {
		return nil, err
	}
	// the parser is only needed for sandboxes, so it is built after the
	// non-sandbox shortcut above
	markdown := goldmark.New(
		goldmark.WithExtensions(
			meta.Meta,
		),
		goldmark.WithParserOptions(
			parser.WithAutoHeadingID(),
		),
	)
	out := []metadata.Metadata{}
	for _, readme := range c.FileSet.Filter(`README.md`) {
		folder := path.Dir(readme.Relative)
		if folder == "." {
			// skip the README at the root of the repository
			continue
		}
		// match whole path segments only: a bare prefix check on "foo"
		// would also claim the history of a sibling like "foobar"
		prefix := folder + "/"
		subHistory := history.Filter(func(pathname string) bool {
			if strings.HasSuffix(pathname, ".md") {
				// exclude any documentation
				return false
			}
			return strings.HasPrefix(pathname, prefix)
		})
		authors := subHistory.Authors()
		raw, err := readme.Raw()
		if err != nil {
			return nil, err
		}
		document := markdown.Parser().Parse(text.NewReader(raw))
		doc := document.OwnerDocument()
		child := doc.FirstChild()
		if child == nil {
			// an empty README has no nodes, hence no title to report
			continue
		}
		title := string(child.Text(raw))
		if title == "" {
			continue
		}
		if len(authors) == 0 {
			continue
		}
		// todo: need the rest of the readme file
		out = append(out, metadata.Metadata{
			Title:       title,
			Author:      authors.Primary(),
			Language:    subHistory.Language(),
			Date:        subHistory.Started(),
			LastUpdated: subHistory.Ended(),
			Maturity:    c.Inventory.Maturity,
			URL:         fmt.Sprintf("%s/%s", c.Repo.HtmlURL, folder),
		})
	}
	return out, nil
}

// Maintainers returns the names of at most two authors from the git history
// of the clone, in the order reported by Authors(), leaving out the
// automation accounts (the GitHub Actions e-mail and dependabot).
//
// The result is empty when the history has no other authors. An error is
// returned when the git history cannot be read.
func (c *Clone) Maintainers(ctx context.Context) ([]string, error) {
	history, err := c.Git.History(ctx)
	if err != nil {
		return nil, err
	}
	const atMost = 2
	// TODO: build up author stats remapper
	var out []string
	for _, v := range history.Authors() {
		if v.Email == "action@github.com" {
			continue
		}
		if v.Author == "dependabot[bot]" {
			continue
		}
		out = append(out, v.Author)
		if len(out) == atMost {
			// stop on the filtered count: slicing by a limit derived from
			// the unfiltered list goes out of range once a bot is skipped
			break
		}
	}
	return out, nil
}
45 changes: 45 additions & 0 deletions metascan/clone/clone_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package clone

import (
"context"
"path/filepath"
"testing"

"github.com/databrickslabs/sandbox/go-libs/env"
"github.com/databrickslabs/sandbox/go-libs/fileset"
"github.com/databrickslabs/sandbox/go-libs/git"
"github.com/databrickslabs/sandbox/go-libs/github"
"github.com/databrickslabs/sandbox/metascan/inventory"
"github.com/stretchr/testify/require"
)

// TestDiscoversSandbox checks that a repository treated as a sandbox yields
// at least one metadata entry. It reads a checkout of
// databricks/terraform-databricks-examples from the metascan cache below the
// user's home directory, so that checkout has to exist on the machine.
func TestDiscoversSandbox(t *testing.T) {
	ctx := context.Background()
	// fail with the real cause instead of a confusing path error further down
	home, err := env.UserHomeDir(ctx)
	require.NoError(t, err)
	dir := filepath.Join(home, ".databricks/labs/metascan/cache/databricks/terraform-databricks-examples")

	fs, err := fileset.RecursiveChildren(dir)
	require.NoError(t, err)

	checkout, err := git.NewCheckout(ctx, dir)
	require.NoError(t, err)

	clone := &Clone{
		Inventory: inventory.Item{
			Org:       "databricks",
			Repo:      "terraform-databricks-examples",
			IsSandbox: true,
		},
		Repo: github.Repo{
			Name:   "terraform-databricks-examples",
			Topics: []string{"terraform", "modules"},
		},
		Git:     checkout,
		FileSet: fs,
	}

	metadatas, err := clone.Metadatas(ctx)
	require.NoError(t, err)

	require.NotEmpty(t, metadatas)
}
11 changes: 9 additions & 2 deletions metascan/internal/clones.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ import (
"fmt"
"path"
"sort"
"strings"

"github.com/databrickslabs/sandbox/go-libs/fileset"
"github.com/databrickslabs/sandbox/go-libs/git"
"github.com/databrickslabs/sandbox/go-libs/github"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark-meta"
meta "github.com/yuin/goldmark-meta"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
)
Expand Down Expand Up @@ -64,7 +65,13 @@ func (c Clone) Metadatas(ctx context.Context) ([]Metadata, error) {
folder := path.Dir(readme.Relative)

subFileset := c.FileSet.Filter(folder)
subHistory := history.Filter(folder)
subHistory := history.Filter(func(pathname string) bool {
if strings.HasSuffix(pathname, ".md") {
// exclude any documentation
return false
}
return strings.HasPrefix(pathname, folder)
})
authors := subHistory.Authors()

raw, err := readme.Raw()
Expand Down
Loading