diff --git a/api/internal/crawl/cmd/crawler/crawler.go b/api/internal/crawl/cmd/crawler/crawler.go index e534780d2e..d87c479962 100644 --- a/api/internal/crawl/cmd/crawler/crawler.go +++ b/api/internal/crawl/cmd/crawler/crawler.go @@ -167,7 +167,7 @@ func main() { it := idx.IterateQuery(query, 10000, 60*time.Second) for it.Next() { for _, hit := range it.Value().Hits.Hits { - seedDocs = append(seedDocs, hit.Document.Copy()) + seedDocs = append(seedDocs, hit.Document.Document.Copy()) } } if err := it.Err(); err != nil { diff --git a/api/internal/crawl/cmd/kustomize_stats/main.go b/api/internal/crawl/cmd/kustomize_stats/main.go index 1034050799..b2c3d5a04f 100644 --- a/api/internal/crawl/cmd/kustomize_stats/main.go +++ b/api/internal/crawl/cmd/kustomize_stats/main.go @@ -2,16 +2,13 @@ package main import ( "context" + "crypto/sha256" "flag" "fmt" "log" - "net/http" - "os" "sort" "time" - "sigs.k8s.io/kustomize/api/internal/crawl/crawler/github" - "sigs.k8s.io/kustomize/api/internal/crawl/doc" "sigs.k8s.io/kustomize/api/internal/crawl/index" @@ -34,9 +31,9 @@ func iterateArr(arr []string, countMap map[string]int) { } -// SortMapKeyByValue takes a map as its input, sorts its keys according to their values +// SortMapKeyByValueInt takes a map as its input, sorts its keys according to their values // in the map, and outputs the sorted keys as a slice. -func SortMapKeyByValue(m map[string]int) []string { +func SortMapKeyByValueInt(m map[string]int) []string { keys := make([]string, 0, len(m)) for key := range m { keys = append(keys, key) } @@ -46,56 +43,58 @@ func SortMapKeyByValue(m map[string]int) []string { return keys } -func GeneratorOrTransformerStats(ctx context.Context, - docs []*doc.Document, isGenerator bool, idx *index.KustomizeIndex) { +// SortMapKeyByValueLen takes a map as its input, sorts its keys according to the lengths of their values +// in the map, and outputs the sorted keys as a slice. +func SortMapKeyByValueLen(m map[string][]string) []string { + keys := make([]string, 0, len(m)) + for key := range m { + keys = append(keys, key) + } + // sort keys by the length of their values in the map m + sort.Slice(keys, func(i, j int) bool { return len(m[keys[i]]) > len(m[keys[j]]) }) + return keys +} - fieldName := "generators" - if !isGenerator { - fieldName = "transformers" +func GeneratorOrTransformerStats(docs []*doc.KustomizationDocument) { + n := len(docs) + if n == 0 { + return } - // allReferredDocs includes all the documents referred in the field - allReferredDocs := doc.NewUniqueDocuments() + fileType := docs[0].FileType + fmt.Printf("There are a total of %d %s files.\n", n, fileType) + + GitRepositorySummary(docs, fileType) - // docUsingGeneratorCount counts the number of the kustomization files using generators or transformers - docCount := 0 + // key of kindToUrls: a string in the KustomizationDocument.Kinds field + // value of kindToUrls: a slice of string URLs defining a given kind.
+ kindToUrls := make(map[string][]string) - // collect all the documents referred in the field for _, d := range docs { - kdoc := doc.KustomizationDocument{ - Document: *d, - } - referredDocs, err := kdoc.GetResources(false, !isGenerator, isGenerator) - if err != nil { - log.Printf("failed to parse the %s field of the Document (%s): %v", - fieldName, d.Path(), err) + url := fmt.Sprintf("%s/blob/%s/%s", d.RepositoryURL, d.DefaultBranch, d.FilePath) + for _, kind := range d.Kinds { + if _, ok := kindToUrls[kind]; !ok { + kindToUrls[kind] = []string{url} + } else { + kindToUrls[kind] = append(kindToUrls[kind], url) + } } - if len(referredDocs) > 0 { - docCount++ - allReferredDocs.AddDocuments(referredDocs) + } + fmt.Printf("There are a total of %d kinds of %s\n", len(kindToUrls), fileType) + sortedKeys := SortMapKeyByValueLen(kindToUrls) + for _, k := range sortedKeys { + sort.Strings(kindToUrls[k]) + fmt.Printf("%s kind %s appears %d times\n", fileType, k, len(kindToUrls[k])) + for _, url := range kindToUrls[k] { + fmt.Printf("%s\n", url) } } - - fileCount, dirCount, fileTypeDocs, dirTypeDocs := DocumentTypeSummary(ctx, allReferredDocs.Documents()) - - // check whether any of the files are not in the index - nonExistFileCount := ExistInIndex(idx, fileTypeDocs, fieldName + " file ") - // check whether any of the dirs are not in the index - nonExistDirCount := ExistInIndex(idx, dirTypeDocs, fieldName + " dir ") - - GitRepositorySummary(fileTypeDocs, fieldName + " files") - GitRepositorySummary(dirTypeDocs, fieldName + " dirs") - - fmt.Printf("%d kustomization files use %s: %d %s are files and %d %s are dirs.\n", - docCount, fieldName, fileCount, fieldName, dirCount, fieldName) - fmt.Printf("%d %s files do not exist in the index\n", nonExistFileCount, fieldName) - fmt.Printf("%d %s dirs do not exist in the index\n", nonExistDirCount, fieldName) } // GitRepositorySummary counts the distribution of docs: // 1) how many git repositories are these docs from? // 2) how many docs are from each git repository? -func GitRepositorySummary(docs []*doc.Document, msgPrefix string) { +func GitRepositorySummary(docs []*doc.KustomizationDocument, fileType string) { m := make(map[string]int) for _, d := range docs { if _, ok := m[d.RepositoryURL]; ok { @@ -104,65 +103,16 @@ func GitRepositorySummary(docs []*doc.Document, msgPrefix string) { m[d.RepositoryURL] = 1 } } - sortedKeys := SortMapKeyByValue(m) + sortedKeys := SortMapKeyByValueInt(m) + topN := 10 + i := 0 for _, k := range sortedKeys { - fmt.Printf("%d %s are from %s\n", m[k], msgPrefix, k) - } -} - -// ExistInIndex goes through each Document in docs, and check whether it is in the index or not. -// It returns the number of documents which does not exist in the index. -func ExistInIndex(idx *index.KustomizeIndex, docs []*doc.Document, msgPrefix string) int { - nonExistCount := 0 - for _, d := range docs { - exists, err := idx.Exists(d.ID()) - if err != nil { - log.Println(err) - } - if !exists { - log.Printf("%s (%s) does not exist in the index", msgPrefix, d.Path()) - nonExistCount++ - } - } - return nonExistCount -} - -// DocumentTypeSummary goes through each doc in docs, and determines whether it is a file or dir.
-func DocumentTypeSummary(ctx context.Context, docs []*doc.Document) ( - fileCount, dirCount int, files, dirs []*doc.Document) { - githubToken := os.Getenv(githubAccessTokenVar) - if githubToken == "" { - log.Fatalf("Must set the variable '%s' to make github requests.\n", - githubAccessTokenVar) - } - ghCrawler := github.NewCrawler(githubToken, retryCount, &http.Client{}, github.QueryWith()) - - for _, d := range docs { - oldFilePath := d.FilePath - if err := ghCrawler.FetchDocument(ctx, d); err != nil { - log.Printf("FetchDocument failed on %s: %v", d.Path(), err) - continue - } - - if d.FilePath == oldFilePath { - fileCount++ - files = append(files, d) - } else { - dirCount++ - dirs = append(dirs, d) - } - } - return fileCount, dirCount, files, dirs -} - -// ExistInSlice checks where target exits in items. -func ExistInSlice(items []string, target string) bool { - for _, item := range items { - if item == target { - return true + if i >= topN { + break } + fmt.Printf("%d %s are from %s\n", m[k], fileType, k) + i++ } - return false } func main() { @@ -204,17 +154,26 @@ If you only want to list the 10 most popular features, set the flag to 10.`) // ids tracks the unique IDs of the documents in the index ids := make(map[string]struct{}) - // generatorDocs includes all the docs using generators - generatorDocs := make([]*doc.Document, 0) + // generatorFiles include all the non-kustomization files whose FileType is generator + generatorFiles := make([]*doc.KustomizationDocument, 0) - // transformersDocs includes all the docs using transformers - transformersDocs := make([]*doc.Document, 0) + // transformersFiles include all the non-kustomization files whose FileType is transformer + transformersFiles := make([]*doc.KustomizationDocument, 0) + + checksums := make(map[string]int) // get all the documents in the index query := []byte(`{ "query":{ "match_all":{} } }`) it := idx.IterateQuery(query, 10000, 60*time.Second) for it.Next() { for _, hit := range it.Value().Hits.Hits { + sum := fmt.Sprintf("%x", sha256.Sum256([]byte(hit.Document.DocumentData))) + if _, ok := checksums[sum]; ok { + checksums[sum]++ + } else { + checksums[sum] = 1 + } + // check whether there is any duplicate IDs in the index if _, ok := ids[hit.ID]; !ok { ids[hit.ID] = struct{}{} @@ -229,11 +188,13 @@ If you only want to list the 10 most popular features, set the flag to 10.`) if doc.IsKustomizationFile(hit.Document.FilePath) { kustomizationFilecount++ iterateArr(hit.Document.Identifiers, kustomizeIdentifiersMap) - if ExistInSlice(hit.Document.Identifiers, "generators") { - generatorDocs = append(generatorDocs, hit.Document.Copy()) - } - if ExistInSlice(hit.Document.Identifiers, "transformers") { - transformersDocs = append(transformersDocs, hit.Document.Copy()) + + } else { + switch hit.Document.FileType { + case "generator": + generatorFiles = append(generatorFiles, hit.Document.Copy()) + case "transformer": + transformersFiles = append(transformersFiles, hit.Document.Copy()) } } } @@ -243,9 +204,9 @@ If you only want to list the 10 most popular features, set the flag to 10.`) log.Fatalf("Error iterating: %v\n", err) } - sortedKindsMapKeys := SortMapKeyByValue(kindsMap) - sortedIdentifiersMapKeys := SortMapKeyByValue(identifiersMap) - sortedKustomizeIdentifiersMapKeys := SortMapKeyByValue(kustomizeIdentifiersMap) + sortedKindsMapKeys := SortMapKeyByValueInt(kindsMap) + sortedIdentifiersMapKeys := SortMapKeyByValueInt(identifiersMap) + sortedKustomizeIdentifiersMapKeys := SortMapKeyByValueInt(kustomizeIdentifiersMap) 
fmt.Printf(`The count of unique document IDs in the kustomize index: %d There are %d documents in the kustomize index. @@ -280,6 +241,14 @@ There are %d documents in the kustomize index. } } - GeneratorOrTransformerStats(ctx, generatorDocs, true, idx) - GeneratorOrTransformerStats(ctx, transformersDocs, false, idx) + GeneratorOrTransformerStats(generatorFiles) + GeneratorOrTransformerStats(transformersFiles) + + fmt.Printf("There are a total of %d checksums of document contents\n", len(checksums)) + sortedChecksums := SortMapKeyByValueInt(checksums) + sortedChecksums = sortedChecksums[:20] + fmt.Printf("The top 20 checksums are:\n") + for _, key := range sortedChecksums { + fmt.Printf("checksum %s appears %d times\n", key, checksums[key]) + } } diff --git a/api/internal/crawl/config/crawler/cronjob/cronjob.yaml b/api/internal/crawl/config/crawler/cronjob/cronjob.yaml index ab333c782c..2a94332523 100644 --- a/api/internal/crawl/config/crawler/cronjob/cronjob.yaml +++ b/api/internal/crawl/config/crawler/cronjob/cronjob.yaml @@ -1,9 +1,10 @@ apiVersion: batch/v1beta1 kind: CronJob metadata: - name: crawler + name: crawler-cronjob spec: - schedule: "5 0 * * */1" + # run the cronjob at 00:00 on every 7th day of the month + schedule: "0 0 */7 * *" jobTemplate: spec: template: @@ -11,7 +12,9 @@ spec: restartPolicy: OnFailure containers: - name: crawler - image: gcr.io/kustomize-search/crawler:latest + image: gcr.io/haiyanmeng-gke-dev/crawler:v1 + command: ["/crawler"] + args: ["--mode=index+github", "--github-repo=kubernetes-sigs/kustomize", "--index=kustomize"] imagePullPolicy: Always env: - name: GITHUB_ACCESS_TOKEN diff --git a/api/internal/crawl/config/elastic/esbackup.yaml b/api/internal/crawl/config/elastic/esbackup.yaml new file mode 100644 index 0000000000..c509ba2062 --- /dev/null +++ b/api/internal/crawl/config/elastic/esbackup.yaml @@ -0,0 +1,16 @@ +# Creating `esbackup/kustomize-backup` will create the `kustomize-backup` snapshot repository. +# Deleting `esbackup/kustomize-backup` will delete the `kustomize-backup` snapshot repository and all the snapshots in the repository. +# Deleting `esbackup/kustomize-backup` will NOT delete essnapshot and esrestore objects.
+apiVersion: elasticsearch.cloud.google.com/v1alpha1 +kind: ESBackup +metadata: + name: kustomize-backup +spec: + storage: + gcs: + bucket: kustomize-backup + path: kustomize + secret: + name: kustomizesa + escluster: + name: esbasic diff --git a/api/internal/crawl/config/elastic/escluster.yaml b/api/internal/crawl/config/elastic/escluster.yaml index 47e1f5458e..4515cee47a 100644 --- a/api/internal/crawl/config/elastic/escluster.yaml +++ b/api/internal/crawl/config/elastic/escluster.yaml @@ -8,6 +8,13 @@ spec: - repository-gcs - ingest-user-agent - ingest-geoip + # To set `gcpserviceaccount`, + # First, create a GCP service account and download its key into a JSON file named `sakey.json`, following the instructions at: + # https://www.elastic.co/guide/en/elasticsearch/plugins/6.5/repository-gcs-usage.html#repository-gcs-using-service-account + # Second, create a secret for the service account using the following command: + # $ kubectl create secret generic kustomizesa --from-file=./sakey.json + gcpserviceaccount: + name: kustomizesa config: env: example: test diff --git a/api/internal/crawl/config/elastic/esrestore.yaml b/api/internal/crawl/config/elastic/esrestore.yaml new file mode 100644 index 0000000000..0d77bed932 --- /dev/null +++ b/api/internal/crawl/config/elastic/esrestore.yaml @@ -0,0 +1,16 @@ +# Creating `esrestore/kustomize-restore` will restore the `kustomize` index in the `kustomize-snapshot` snapshot to a new index named `kustomize-restore`. +# Deleting `esrestore/kustomize-restore` will not delete the restored index. +# Deleting `esrestore/kustomize-restore` should happen before deleting `essnapshot/kustomize-snapshot`. +apiVersion: elasticsearch.cloud.google.com/v1alpha1 +kind: ESRestore +metadata: + name: kustomize-restore +spec: + include_global_state: true + ignore_unavailable: true + rename_pattern: kustomize + rename_replacement: kustomize-restore + essnapshot: + name: kustomize-snapshot + escluster: + name: esbasic diff --git a/api/internal/crawl/config/elastic/essnaptshot.yaml b/api/internal/crawl/config/elastic/essnaptshot.yaml new file mode 100644 index 0000000000..2a94689608 --- /dev/null +++ b/api/internal/crawl/config/elastic/essnaptshot.yaml @@ -0,0 +1,15 @@ +# Creating `essnapshot/kustomize-snapshot` will create a snapshot named `kustomize-snapshot` in the `kustomize-backup` snapshot repository. +# Deleting `essnapshot/kustomize-snapshot` will delete the snapshot. +# Deleting `essnapshot/kustomize-snapshot` should happen before deleting `esbackup/kustomize-backup`. +apiVersion: elasticsearch.cloud.google.com/v1alpha1 +kind: ESSnapshot +metadata: + name: kustomize-snapshot +spec: + # indices are optional. If not specified, all indices are selected.
+ indices: + - kustomize + include_global_state: true + ignore_unavailable: true + esbackup: + name: kustomize-backup diff --git a/api/internal/crawl/crawler/github/crawler.go b/api/internal/crawl/crawler/github/crawler.go index 21cfd6f817..a749c67848 100644 --- a/api/internal/crawl/crawler/github/crawler.go +++ b/api/internal/crawl/crawler/github/crawler.go @@ -80,6 +80,36 @@ func (gc githubCrawler) DefaultBranch(repo string) string { func (gc githubCrawler) Crawl(ctx context.Context, output chan<- crawler.CrawledDocument, seen utils.SeenMap) error { + ranges := []RangeWithin{ + RangeWithin{ + start: uint64(0), + end: githubMaxFileSize, + }, + } + + errs := make(multiError, 0) + for len(ranges) > 0 { + tailRange := ranges[len(ranges) - 1] + ranges = ranges[:(len(ranges) - 1)] + reProcessQueryRanges, err := gc.CrawlSingleRange(ctx, output, seen, tailRange.start, tailRange.end) + if err != nil { + errs = append(errs, err) + } + ranges = append(ranges, reProcessQueryRanges...) + } + + if len(errs) > 0 { + return errs + } + return nil +} + +func (gc githubCrawler) CrawlSingleRange(ctx context.Context, + output chan<- crawler.CrawledDocument, seen utils.SeenMap, + lowerBound, upperBound uint64) ([]RangeWithin, error) { + + log.Printf("CrawlSingleRange [%d, %d]", lowerBound, upperBound) + noETagClient := GhClient{ RequestConfig: gc.client.RequestConfig, client: &http.Client{Timeout: gc.client.client.Timeout}, @@ -87,13 +117,16 @@ func (gc githubCrawler) Crawl(ctx context.Context, accessToken: gc.client.accessToken, } + var reProcessQueryRanges []RangeWithin + var ranges []string var err error // Since Github returns a max of 1000 results per query, we can use // multiple queries that split the search space into chunks of at most // 1000 files to get all of the data. for i := 0; i < 5; i++ { - ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query)) + ranges, err = FindRangesForRepoSearch(newCache(noETagClient, gc.query), + lowerBound, upperBound) if err == nil { logger.Printf("FindRangesForRepoSearch succeeded after %d retries", i) break @@ -102,7 +135,7 @@ func (gc githubCrawler) Crawl(ctx context.Context, } } if err != nil { - return fmt.Errorf("could not split %v into ranges, %v\n", + return reProcessQueryRanges, fmt.Errorf("could not split %v into ranges, %v\n", gc.query, err) } @@ -112,20 +145,23 @@ func (gc githubCrawler) Crawl(ctx context.Context, errs := make(multiError, 0) queryResult := RangeQueryResult{} for _, query := range ranges { - rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) + reProcessQuery, rangeResult, err := processQuery(ctx, gc.client, query, output, seen, gc.branchMap) if err != nil { errs = append(errs, err) } queryResult.Add(rangeResult) + if reProcessQuery { + reProcessQueryRanges = append(reProcessQueryRanges, RangeSizes(query)) + } } logger.Printf("Summary of Crawl: %s", queryResult.String()) if len(errs) > 0 { - return errs + return reProcessQueryRanges, errs } - return nil + return reProcessQueryRanges, nil } // FetchDocument first tries to fetch the document with d.FilePath. If it fails, @@ -225,7 +261,7 @@ func (r *RangeQueryResult) String() string { // documents from the crawl to the datastore/index. 
func processQuery(ctx context.Context, gcl GhClient, query string, output chan<- crawler.CrawledDocument, seen utils.SeenMap, - branchMap map[string]string) (RangeQueryResult, error) { + branchMap map[string]string) (bool, RangeQueryResult, error) { queryPages := make(chan GhResponseInfo) @@ -241,6 +277,8 @@ func processQuery(ctx context.Context, gcl GhClient, query string, close(queryPages) }() + reProcessQuery := false + errs := make(multiError, 0) result := RangeQueryResult{} pageID := 1 @@ -271,11 +309,15 @@ func processQuery(ctx context.Context, gcl GhClient, query string, result.Add(pageResult) pageID++ + + if page.Parsed.TotalCount > githubMaxResultsPerQuery { + reProcessQuery = true + } } logger.Printf("Summary of processQuery: %s", result.String()) - return result, errs + return reProcessQuery, result, errs } func kustomizationResultAdapter(gcl GhClient, k GhFileSpec, seen utils.SeenMap, @@ -337,7 +379,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string, output chan<- GhResponseInfo) error { logger.Println("querying: ", query) - response := gcl.parseGithubResponse(query) + response := gcl.parseGithubResponseWithRetry(query) if response.Error != nil { return response.Error @@ -350,7 +392,7 @@ func (gcl GhClient) ForwardPaginatedQuery(ctx context.Context, query string, case <-ctx.Done(): return nil default: - response = gcl.parseGithubResponse(response.NextURL) + response = gcl.parseGithubResponseWithRetry(response.NextURL) if response.Error != nil { return response.Error } @@ -545,6 +587,8 @@ type githubResponse struct { // This is the number of files that match the query. TotalCount uint64 `json:"total_count,omitempty"` + IncompleteResults bool `json:"incomplete_results,omitempty"` + // Github representation of a file. Items []GhFileSpec `json:"items,omitempty"` } @@ -587,6 +631,17 @@ func parseGithubLinkFormat(links string) (string, string) { return next, last } +func (gcl GhClient) parseGithubResponseWithRetry(getRequest string) GhResponseInfo { + resp := gcl.parseGithubResponse(getRequest) + retries := 0 + for resp.Parsed.IncompleteResults { + resp = gcl.parseGithubResponse(getRequest) + retries++ + } + log.Printf("The result of query(%s) is complete after %d retries", getRequest, retries) + return resp +} + func (gcl GhClient) parseGithubResponse(getRequest string) GhResponseInfo { resp, err := gcl.SearchGithubAPI(getRequest) requestInfo := GhResponseInfo{ diff --git a/api/internal/crawl/crawler/github/split_search_ranges.go b/api/internal/crawl/crawler/github/split_search_ranges.go index 8d9037478d..a852330c82 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges.go +++ b/api/internal/crawl/crawler/github/split_search_ranges.go @@ -100,6 +100,8 @@ package github import ( "fmt" "math/bits" + "strconv" + "strings" ) // Files cannot be more than 2^19 bytes, according to @@ -112,7 +114,7 @@ const ( // Interface instead of struct for testing purposes. // Not expecting to have multiple implementations. 
type cachedSearch interface { - CountResults(uint64) (uint64, error) + CountResults(uint64, uint64) (uint64, error) RequestString(filesize rangeFormatter) string } @@ -161,16 +163,16 @@ func newCache(client GhClient, query Query) githubCachedSearch { } } -func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { +func (c githubCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) { count, cached := c.cache[upperBound] if cached { return count, nil } - sizeRange := RangeWithin{0, upperBound} + sizeRange := RangeWithin{lowerBound, upperBound} rangeRequest := c.RequestString(sizeRange) - result := c.gcl.parseGithubResponse(rangeRequest) + result := c.gcl.parseGithubResponseWithRetry(rangeRequest) if result.Error != nil { return count, result.Error } @@ -204,7 +206,7 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { "Retrying query... current lower bound: %d, got: %d\n", c.cache[prev], result.Parsed.TotalCount) - result = c.gcl.parseGithubResponse(rangeRequest) + result = c.gcl.parseGithubResponseWithRetry(rangeRequest) if result.Error != nil { return count, result.Error } @@ -219,8 +221,8 @@ func (c githubCachedSearch) CountResults(upperBound uint64) (uint64, error) { } count = result.Parsed.TotalCount - logger.Printf("Caching new query %s, with count %d\n", - sizeRange.RangeString(), count) + logger.Printf("Caching new query %s, with count %d (incomplete_results: %v)\n", + sizeRange.RangeString(), count, result.Parsed.IncompleteResults) c.cache[upperBound] = count return count, nil } @@ -238,8 +240,8 @@ func (c githubCachedSearch) RequestString(filesize rangeFormatter) string { // This would mean that the search as it is could not find all files. If queries // are sorted by last indexed, and retrieved on regular intervals, it should be // sufficient to get most if not all documents. -func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { - totalFiles, err := cache.CountResults(githubMaxFileSize) +func FindRangesForRepoSearch(cache cachedSearch, lowerBound, upperBound uint64) ([]string, error) { + totalFiles, err := cache.CountResults(lowerBound, upperBound) if err != nil { return nil, err } @@ -247,7 +249,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { if githubMaxResultsPerQuery >= totalFiles { return []string{ - cache.RequestString(RangeWithin{0, githubMaxFileSize}), + cache.RequestString(RangeWithin{lowerBound, upperBound}), }, nil } @@ -275,6 +277,7 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { // range. filesAccessible := uint64(0) sizes := make([]uint64, 0) + sizes = append(sizes, lowerBound) for filesAccessible < totalFiles { target := filesAccessible + githubMaxResultsPerQuery if target >= totalFiles { @@ -284,22 +287,22 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { logger.Printf("%d accessible files, next target = %d\n", filesAccessible, target) - cur, err := lowerBoundFileCount(cache, target) + size, err := FindFileSize(cache, target, lowerBound, upperBound) if err != nil { return nil, err } // If there are more than 1000 files in the next bucket, we must // advance anyway and lose out on some files :(. 
- if l := len(sizes); l > 0 && sizes[l-1] == cur { - cur++ + if l := len(sizes); l > 0 && sizes[l-1] == size { + size++ } - nextAccessible, err := cache.CountResults(cur) + nextAccessible, err := cache.CountResults(lowerBound, size) if err != nil { return nil, fmt.Errorf( "cache should be populated at %d already, got %v", - cur, err) + size, err) } if nextAccessible < filesAccessible { return nil, fmt.Errorf( @@ -309,31 +312,31 @@ func FindRangesForRepoSearch(cache cachedSearch) ([]string, error) { filesAccessible = nextAccessible if nextAccessible < totalFiles { - sizes = append(sizes, cur) + sizes = append(sizes, size) } } - + sizes = append(sizes, upperBound) return formatFilesizeRanges(cache, sizes), nil } -// lowerBoundFileCount finds the filesize range from [0, return value] that has +// FindFileSize finds the filesize range from [lowerBound, return value] that has // the largest file count that is smaller than or equal to // githubMaxResultsPerQuery. It is important to note that this returned value // could already be in a previous range if the next file size has more than 1000 // results. It is left to the caller to handle this bit of logic and guarantee // forward progession in this case. -func lowerBoundFileCount( - cache cachedSearch, targetFileCount uint64) (uint64, error) { +func FindFileSize( + cache cachedSearch, targetFileCount, lowerBound, upperBound uint64) (uint64, error) { // Binary search for file sizes that make up the next <=1000 element // chunk. - cur := uint64(0) - increase := githubMaxFileSize / 2 + cur := lowerBound + increase := (upperBound - lowerBound) / 2 for increase > 0 { mid := cur + increase - count, err := cache.CountResults(mid) + count, err := cache.CountResults(lowerBound, mid) if err != nil { return count, err } @@ -353,26 +356,24 @@ func lowerBoundFileCount( } func formatFilesizeRanges(cache cachedSearch, sizes []uint64) []string { - ranges := make([]string, 0, len(sizes)+1) - - if len(sizes) > 0 { - ranges = append(ranges, cache.RequestString( - RangeLessThan{sizes[0] + 1}, - )) + n := len(sizes) + if n < 2 { + return []string{} } - for i := 0; i < len(sizes)-1; i += 1 { - ranges = append(ranges, cache.RequestString( - RangeWithin{sizes[i] + 1, sizes[i+1]}, - )) - - if i != len(sizes)-2 { - continue - } - ranges = append(ranges, cache.RequestString( - RangeGreaterThan{sizes[i+1]}, - )) + ranges := make([]string, 0, n-1) + ranges = append(ranges, cache.RequestString(RangeWithin{sizes[0], sizes[1]})) + for i := 1; i < n-1; i++ { + ranges = append(ranges, cache.RequestString(RangeWithin{sizes[i] + 1, sizes[i+1]})) } - return ranges } + +func RangeSizes(s string) RangeWithin { + start := strings.Index(s, "+size:") + len("+size:") + end := strings.Index(s, "&") + ranges := strings.Split(s[start:end], "..") + lowerBound, _ := strconv.ParseUint(ranges[0], 10, 64) + upperBound, _ := strconv.ParseUint(ranges[1], 10, 64) + return RangeWithin{lowerBound, upperBound} +} diff --git a/api/internal/crawl/crawler/github/split_search_ranges_test.go b/api/internal/crawl/crawler/github/split_search_ranges_test.go index ad332388de..788ef17bc9 100644 --- a/api/internal/crawl/crawler/github/split_search_ranges_test.go +++ b/api/internal/crawl/crawler/github/split_search_ranges_test.go @@ -11,7 +11,7 @@ type testCachedSearch struct { cache map[uint64]uint64 } -func (c testCachedSearch) CountResults(upperBound uint64) (uint64, error) { +func (c testCachedSearch) CountResults(lowerBound, upperBound uint64) (uint64, error) { log.Printf("CountResults(%05x)\n", upperBound) 
count, ok := c.cache[upperBound] if !ok { @@ -73,19 +73,29 @@ func TestRangeSplitting(t *testing.T) { }, } - requests, err := FindRangesForRepoSearch(cache) + requests, err := FindRangesForRepoSearch(cache, 0, 524288) if err != nil { t.Errorf("Error while finding ranges: %v", err) } expected := []string{ - "<107", // cache.RequestString(RangeLessThan{0x6b}), - "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), - "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), - "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), - ">4095", // cache.RequestString(RangeGreaterThan{0xfff}), + "0..106", // cache.RequestString(RangeWithin{0x00, 0x6a}), + "107..128", // cache.RequestString(RangeWithin{0x6b, 0x80}), + "129..256", // cache.RequestString(RangeWithin{0x81, 0x100}), + "257..4095", // cache.RequestString(RangeWithin{0x101, 0xfff}), + "4096..524288", // cache.RequestString(RangeWithin{0x1000, 0x80000}), } if !reflect.DeepEqual(requests, expected) { t.Errorf("Expected requests (%v) to equal (%v)", requests, expected) } } + +func TestRangeSizes(t *testing.T) { + s := "https://api.github.com/search/code?q=filename:kustomization.yaml+filename:kustomization.yml" + + "+filename:kustomization+size:2365..10000&order=desc&per_page=100&sort=indexed" + returnedResult := RangeSizes(s) + expectedResult := RangeWithin{uint64(2365), uint64(10000)} + if !reflect.DeepEqual(returnedResult, expectedResult) { + t.Errorf("RangeSizes expected (%v), got (%v)",expectedResult, returnedResult) + } +} diff --git a/api/internal/crawl/doc/doc.go b/api/internal/crawl/doc/doc.go index 0e5965ac1b..d37fd19cac 100644 --- a/api/internal/crawl/doc/doc.go +++ b/api/internal/crawl/doc/doc.go @@ -46,6 +46,15 @@ type KustomizationDocument struct { type set map[string]struct{} +func (doc *KustomizationDocument) Copy() *KustomizationDocument { + return &KustomizationDocument{ + Document: *(doc.Document.Copy()), + Kinds: doc.Kinds, + Identifiers: doc.Identifiers, + Values: doc.Values, + } +} + func (doc *KustomizationDocument) String() string { return fmt.Sprintf("%s %s %s %v %v %v len(identifiers):%v len(values):%v", doc.RepositoryURL, doc.FilePath, doc.DefaultBranch, doc.CreationTime, diff --git a/api/internal/crawl/search_cmds/generator.md b/api/internal/crawl/search_cmds/generator.md new file mode 100644 index 0000000000..0b324600f8 --- /dev/null +++ b/api/internal/crawl/search_cmds/generator.md @@ -0,0 +1,29 @@ +Find all the generator files whose `kinds` field includes `ChartRenderer`, and +only output certain fields of each document: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 200, + "_source": { + "includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"] + }, + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "generator" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "must": { + "match" : { + "kinds" : { + "query" : "ChartRenderer" + } + } + } + } + } +} +' +``` \ No newline at end of file diff --git a/api/internal/crawl/search_cmds/snapshot.md b/api/internal/crawl/search_cmds/snapshot.md new file mode 100644 index 0000000000..fd7cc53019 --- /dev/null +++ b/api/internal/crawl/search_cmds/snapshot.md @@ -0,0 +1,29 @@ +Retrieve information about all registered snapshot repositories: +``` +curl -X GET "${ElasticSearchURL}:9200/_snapshot?pretty" +``` + +Retrieve information about a given snapshot repository, `kustomize-backup`: +``` +curl -X GET 
"${ElasticSearchURL}:9200/_snapshot/kustomize-backup?pretty" +``` + +Verify a snapshot repository, `kustomize-backup`, manually: +``` +curl -X POST "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/_verify?pretty" +``` + +List all the snapshots in a given snapshot repository: +``` +curl -X GET "${ElasticSearchURL}:9200/_cat/snapshots/kustomize-backup?v&s=id&pretty" +``` + +Retrieve a summary information about a given snapshot: +``` +curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot?pretty" +``` + +Retrieve a detailed information about a given snapshot: +``` +curl -X GET "${ElasticSearchURL}:9200/_snapshot/kustomize-backup/kustomize-snapshot/_status?pretty" +``` diff --git a/api/internal/crawl/search_cmds/transformer.md b/api/internal/crawl/search_cmds/transformer.md new file mode 100644 index 0000000000..0b35aeb6e4 --- /dev/null +++ b/api/internal/crawl/search_cmds/transformer.md @@ -0,0 +1,29 @@ +Find all the trasnformer files whose `kinds` field includes `HelmValues`, and +only output certain fields of each document: +``` +curl -X GET "${ElasticSearchURL}:9200/${INDEXNAME}/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "size": 200, + "_source": { + "includes": ["kinds", "repositoryUrl", "defaultBranch", "filePath"] + }, + "query": { + "bool": { + "filter": [ + { "regexp": { "fileType": "transformer" }} + ], + "must_not": { + "regexp": { "filePath": "(.*/)?kustomization((.yaml)?|(.yml)?)(/)*" } + }, + "must": { + "match" : { + "kinds" : { + "query" : "HelmValues" + } + } + } + } + } +} +' +``` \ No newline at end of file