Skip to content

Commit

Permalink
Merge pull request kubernetes-sigs#2172 from haiyanmeng/stats
Browse files Browse the repository at this point in the history
Several improvements on crawler
  • Loading branch information
monopole authored Feb 3, 2020
2 parents c626eae + 3ebeeba commit c683e6a
Show file tree
Hide file tree
Showing 14 changed files with 361 additions and 173 deletions.
2 changes: 1 addition & 1 deletion api/internal/crawl/cmd/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ func main() {
it := idx.IterateQuery(query, 10000, 60*time.Second)
for it.Next() {
for _, hit := range it.Value().Hits.Hits {
seedDocs = append(seedDocs, hit.Document.Copy())
seedDocs = append(seedDocs, hit.Document.Document.Copy())
}
}
if err := it.Err(); err != nil {
Expand Down
193 changes: 81 additions & 112 deletions api/internal/crawl/cmd/kustomize_stats/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@ package main

import (
"context"
"crypto/sha256"
"flag"
"fmt"
"log"
"net/http"
"os"
"sort"
"time"

"sigs.k8s.io/kustomize/api/internal/crawl/crawler/github"

"sigs.k8s.io/kustomize/api/internal/crawl/doc"

"sigs.k8s.io/kustomize/api/internal/crawl/index"
Expand All @@ -34,9 +31,9 @@ func iterateArr(arr []string, countMap map[string]int) {

}

// SortMapKeyByValue takes a map as its input, sorts its keys according to their values
// SortMapKeyByValueInt takes a map as its input, sorts its keys according to their values
// in the map, and outputs the sorted keys as a slice.
func SortMapKeyByValue(m map[string]int) []string {
func SortMapKeyByValueInt(m map[string]int) []string {
keys := make([]string, 0, len(m))
for key := range m {
keys = append(keys, key)
Expand All @@ -46,56 +43,58 @@ func SortMapKeyByValue(m map[string]int) []string {
return keys
}

func GeneratorOrTransformerStats(ctx context.Context,
docs []*doc.Document, isGenerator bool, idx *index.KustomizeIndex) {
// SortMapKeyByValueLen returns the keys of m ordered by the length of their
// value slices, longest first. Keys whose value slices have the same length
// are ordered alphabetically, so the result does not depend on Go's randomized
// map iteration order. A nil or empty map yields an empty, non-nil slice.
func SortMapKeyByValueLen(m map[string][]string) []string {
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	sort.Slice(keys, func(i, j int) bool {
		li, lj := len(m[keys[i]]), len(m[keys[j]])
		if li != lj {
			return li > lj
		}
		// Break ties by key: sort.Slice is not stable and the keys were
		// collected in random map order, so without this the output for
		// equal-length entries would change from run to run.
		return keys[i] < keys[j]
	})
	return keys
}

fieldName := "generators"
if !isGenerator {
fieldName = "transformers"
func GeneratorOrTransformerStats(docs []*doc.KustomizationDocument) {
n := len(docs)
if n == 0 {
return
}

// allReferredDocs includes all the documents referred in the field
allReferredDocs := doc.NewUniqueDocuments()
fileType := docs[0].FileType
fmt.Printf("There are totally %d %s files.\n", n, fileType)

GitRepositorySummary(docs, fileType)

// docUsingGeneratorCount counts the number of the kustomization files using generators or transformers
docCount := 0
// key of kindToUrls: a string in the KustomizationDocument.Kinds field
// value of kindToUrls: a slice of string urls defining a given kind.
kindToUrls := make(map[string][]string)

// collect all the documents referred in the field
for _, d := range docs {
kdoc := doc.KustomizationDocument{
Document: *d,
}
referredDocs, err := kdoc.GetResources(false, !isGenerator, isGenerator)
if err != nil {
log.Printf("failed to parse the %s field of the Document (%s): %v",
fieldName, d.Path(), err)
url := fmt.Sprintf("%s/blob/%s/%s", d.RepositoryURL, d.DefaultBranch, d.FilePath)
for _, kind := range d.Kinds {
if _, ok := kindToUrls[kind]; !ok {
kindToUrls[kind] = []string{url}
} else {
kindToUrls[kind] = append(kindToUrls[kind], url)
}
}
if len(referredDocs) > 0 {
docCount++
allReferredDocs.AddDocuments(referredDocs)
}
fmt.Printf("There are totally %d kinds of %s\n", len(kindToUrls), fileType)
sortedKeys := SortMapKeyByValueLen(kindToUrls)
for _, k := range sortedKeys {
sort.Strings(kindToUrls[k])
fmt.Printf("%s kind %s appears %d times\n", fileType, k, len(kindToUrls[k]))
for _, url := range kindToUrls[k] {
fmt.Printf("%s\n", url)
}
}

fileCount, dirCount, fileTypeDocs, dirTypeDocs := DocumentTypeSummary(ctx, allReferredDocs.Documents())

// check whether any of the files are not in the index
nonExistFileCount := ExistInIndex(idx, fileTypeDocs, fieldName + " file ")
// check whether any of the dirs are not in the index
nonExistDirCount := ExistInIndex(idx, dirTypeDocs, fieldName + " dir ")

GitRepositorySummary(fileTypeDocs, fieldName + " files")
GitRepositorySummary(dirTypeDocs, fieldName + " dirs")

fmt.Printf("%d kustomization files use %s: %d %s are files and %d %s are dirs.\n",
docCount, fieldName, fileCount, fieldName, dirCount, fieldName)
fmt.Printf("%d %s files do not exist in the index\n", nonExistFileCount, fieldName)
fmt.Printf("%d %s dirs do not exist in the index\n", nonExistDirCount, fieldName)
}

// GitRepositorySummary counts the distribution of docs:
// 1) how many git repositories are these docs from?
// 2) how many docs are from each git repository?
func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
func GitRepositorySummary(docs []*doc.KustomizationDocument, fileType string) {
m := make(map[string]int)
for _, d := range docs {
if _, ok := m[d.RepositoryURL]; ok {
Expand All @@ -104,65 +103,16 @@ func GitRepositorySummary(docs []*doc.Document, msgPrefix string) {
m[d.RepositoryURL] = 1
}
}
sortedKeys := SortMapKeyByValue(m)
sortedKeys := SortMapKeyByValueInt(m)
topN := 10
i := 0
for _, k := range sortedKeys {
fmt.Printf("%d %s are from %s\n", m[k], msgPrefix, k)
}
}

// ExistInIndex goes through each Document in docs, and check whether it is in the index or not.
// It returns the number of documents which does not exist in the index.
func ExistInIndex(idx *index.KustomizeIndex, docs []*doc.Document, msgPrefix string) int {
nonExistCount := 0
for _, d := range docs {
exists, err := idx.Exists(d.ID())
if err != nil {
log.Println(err)
}
if !exists {
log.Printf("%s (%s) does not exist in the index", msgPrefix, d.Path())
nonExistCount++
}
}
return nonExistCount
}

// DocumentTypeSummary goes through each doc in docs, and determines whether it is a file or dir.
func DocumentTypeSummary(ctx context.Context, docs []*doc.Document) (
fileCount, dirCount int, files, dirs []*doc.Document) {
githubToken := os.Getenv(githubAccessTokenVar)
if githubToken == "" {
log.Fatalf("Must set the variable '%s' to make github requests.\n",
githubAccessTokenVar)
}
ghCrawler := github.NewCrawler(githubToken, retryCount, &http.Client{}, github.QueryWith())

for _, d := range docs {
oldFilePath := d.FilePath
if err := ghCrawler.FetchDocument(ctx, d); err != nil {
log.Printf("FetchDocument failed on %s: %v", d.Path(), err)
continue
}

if d.FilePath == oldFilePath {
fileCount++
files = append(files, d)
} else {
dirCount++
dirs = append(dirs, d)
}
}
return fileCount, dirCount, files, dirs
}

// ExistInSlice checks where target exits in items.
func ExistInSlice(items []string, target string) bool {
for _, item := range items {
if item == target {
return true
if i >= topN {
break
}
fmt.Printf("%d %s are from %s\n", m[k], fileType, k)
i++
}
return false
}

func main() {
Expand Down Expand Up @@ -204,17 +154,26 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
// ids tracks the unique IDs of the documents in the index
ids := make(map[string]struct{})

// generatorDocs includes all the docs using generators
generatorDocs := make([]*doc.Document, 0)
// generatorFiles include all the non-kustomization files whose FileType is generator
generatorFiles := make([]*doc.KustomizationDocument, 0)

// transformersDocs includes all the docs using transformers
transformersDocs := make([]*doc.Document, 0)
// transformersFiles include all the non-kustomization files whose FileType is transformer
transformersFiles := make([]*doc.KustomizationDocument, 0)

checksums := make(map[string]int)

// get all the documents in the index
query := []byte(`{ "query":{ "match_all":{} } }`)
it := idx.IterateQuery(query, 10000, 60*time.Second)
for it.Next() {
for _, hit := range it.Value().Hits.Hits {
sum := fmt.Sprintf("%x", sha256.Sum256([]byte(hit.Document.DocumentData)))
if _, ok := checksums[sum]; ok {
checksums[sum]++
} else {
checksums[sum] = 1
}

// check whether there is any duplicate IDs in the index
if _, ok := ids[hit.ID]; !ok {
ids[hit.ID] = struct{}{}
Expand All @@ -229,11 +188,13 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
if doc.IsKustomizationFile(hit.Document.FilePath) {
kustomizationFilecount++
iterateArr(hit.Document.Identifiers, kustomizeIdentifiersMap)
if ExistInSlice(hit.Document.Identifiers, "generators") {
generatorDocs = append(generatorDocs, hit.Document.Copy())
}
if ExistInSlice(hit.Document.Identifiers, "transformers") {
transformersDocs = append(transformersDocs, hit.Document.Copy())

} else {
switch hit.Document.FileType {
case "generator":
generatorFiles = append(generatorFiles, hit.Document.Copy())
case "transformer":
transformersFiles = append(transformersFiles, hit.Document.Copy())
}
}
}
Expand All @@ -243,9 +204,9 @@ If you only want to list the 10 most popular features, set the flag to 10.`)
log.Fatalf("Error iterating: %v\n", err)
}

sortedKindsMapKeys := SortMapKeyByValue(kindsMap)
sortedIdentifiersMapKeys := SortMapKeyByValue(identifiersMap)
sortedKustomizeIdentifiersMapKeys := SortMapKeyByValue(kustomizeIdentifiersMap)
sortedKindsMapKeys := SortMapKeyByValueInt(kindsMap)
sortedIdentifiersMapKeys := SortMapKeyByValueInt(identifiersMap)
sortedKustomizeIdentifiersMapKeys := SortMapKeyByValueInt(kustomizeIdentifiersMap)

fmt.Printf(`The count of unique document IDs in the kustomize index: %d
There are %d documents in the kustomize index.
Expand Down Expand Up @@ -280,6 +241,14 @@ There are %d documents in the kustomize index.
}
}

GeneratorOrTransformerStats(ctx, generatorDocs, true, idx)
GeneratorOrTransformerStats(ctx, transformersDocs, false, idx)
GeneratorOrTransformerStats(generatorFiles)
GeneratorOrTransformerStats(transformersFiles)

fmt.Printf("There are total %d checksums of document contents\n", len(checksums))
sortedChecksums := SortMapKeyByValueInt(checksums)
sortedChecksums = sortedChecksums[:20]
fmt.Printf("The top 20 checksums are:\n")
for _, key := range sortedChecksums {
fmt.Printf("checksum %s apprears %d\n", key, checksums[key])
}
}
9 changes: 6 additions & 3 deletions api/internal/crawl/config/crawler/cronjob/cronjob.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: crawler
name: crawler-cronjob
spec:
schedule: "5 0 * * */1"
# run the cronjob at 00:00 on days 1, 8, 15, 22 and 29 of each month (roughly every 7 days)
schedule: "0 0 */7 * *"
jobTemplate:
spec:
template:
spec:
restartPolicy: OnFailure
containers:
- name: crawler
image: gcr.io/kustomize-search/crawler:latest
image: gcr.io/haiyanmeng-gke-dev/crawler:v1
command: ["/crawler"]
args: ["--mode=index+github", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"]
imagePullPolicy: Always
env:
- name: GITHUB_ACCESS_TOKEN
Expand Down
16 changes: 16 additions & 0 deletions api/internal/crawl/config/elastic/esbackup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Creating `esbackup/kustomize-backup` will create the `kustomize-backup` snapshot repository.
# Deleting `esbackup/kustomize-backup` will delete the `kustomize-backup` snapshot repository and all the snapshots in the repository.
# Deleting `esbackup/kustomize-backup` will NOT delete essnapshot and esrestore objects.
apiVersion: elasticsearch.cloud.google.com/v1alpha1
kind: ESBackup
metadata:
name: kustomize-backup
spec:
storage:
gcs:
bucket: kustomize-backup
path: kustomize
secret:
name: kustomizesa
escluster:
name: esbasic
7 changes: 7 additions & 0 deletions api/internal/crawl/config/elastic/escluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ spec:
- repository-gcs
- ingest-user-agent
- ingest-geoip
# To set `gcpserviceaccount`:
# First, create a GCP service account and download its key into a JSON file named `sakey.json`, following the instructions at:
# https://www.elastic.co/guide/en/elasticsearch/plugins/6.5/repository-gcs-usage.html#repository-gcs-using-service-account
# Second, create a secret for the service account using the following command:
# $ kubectl create secret generic kustomizesa --from-file=./sakey.json
gcpserviceaccount:
name: kustomizesa
config:
env:
example: test
Expand Down
16 changes: 16 additions & 0 deletions api/internal/crawl/config/elastic/esrestore.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Creating `esrestore/kustomize-restore` will restore the `kustomize` index in the `kustomize-snapshot` snapshot to a new index named `kustomize-restore`.
# Deleting `esrestore/kustomize-restore` will not delete the restored index.
# Deleting `esrestore/kustomize-restore` should happen before deleting `essnapshot/kustomize-snapshot`.
apiVersion: elasticsearch.cloud.google.com/v1alpha1
kind: ESRestore
metadata:
name: kustomize-restore
spec:
include_global_state: true
ignore_unavailable: true
rename_pattern: kustomize
rename_replacement: kustomize-restore
essnapshot:
name: kustomize-snapshot
escluster:
name: esbasic
15 changes: 15 additions & 0 deletions api/internal/crawl/config/elastic/essnaptshot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Creating `essnapshot/kustomize-snapshot` will create a snapshot named `kustomize-snapshot` in the `kustomize-backup` snapshot repository.
# Deleting `essnapshot/kustomize-snapshot` will delete the snapshot.
# Deleting `essnapshot/kustomize-snapshot` should happen before deleting `esbackup/kustomize-backup`.
apiVersion: elasticsearch.cloud.google.com/v1alpha1
kind: ESSnapshot
metadata:
name: kustomize-snapshot
spec:
# indices are optional. If not specified all indices are selected.
indices:
- kustomize
include_global_state: true
ignore_unavailable: true
esbackup:
name: kustomize-backup
Loading

0 comments on commit c683e6a

Please sign in to comment.