Skip to content

Commit

Permalink
Merge pull request kubernetes-sigs#2090 from haiyanmeng/retry
Browse files Browse the repository at this point in the history
Add the Document ID pointing to a kustomization root into the cache to avoid crawling it repeatedly
  • Loading branch information
monopole authored Jan 13, 2020
2 parents ecff981 + 569fafb commit 62e5abd
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 2 deletions.
19 changes: 18 additions & 1 deletion api/internal/crawl/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
SetCreatedErrCount := 0
convErrCount := 0
deleteDocCount := 0
crawledDocCount := 0

// During the execution of the for loop, more Documents may be added into (*docsPtr).
for len(*docsPtr) > 0 {
Expand All @@ -114,7 +115,11 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
// remove the last Document in (*docPtr)
*docsPtr = (*docsPtr)[:(len(*docsPtr) - 1)]

crawledDocCount++
logger.Printf("Crawling doc %d: %s %s", crawledDocCount, tail.RepositoryURL, tail.FilePath)

if _, ok := seen[tail.ID()]; ok {
logger.Printf("this doc has been seen before")
seenDocCount++
continue
}
Expand All @@ -132,7 +137,15 @@ func doCrawl(ctx context.Context, docsPtr *CrawlSeed, crawlers []Crawler, conv C
continue
}

logger.Println("Crawling ", tail.RepositoryURL, tail.FilePath)
// If the Document represents a kustomization root, FetchDocument will change
// the `filePath` field of the Document by adding `kustomization.yaml` or
// `kustomization.yml` or `kustomization` into the field.
// Therefore, it is necessary to add the ID of the Document into seen before
// calling FetchDocument. Otherwise, the binary may enter an infinite loop
// if a kustomization file points to its kustomization root in its `resources` or
// `bases` field.
seen[tail.ID()] = struct{}{}

if err := match.FetchDocument(ctx, tail); err != nil {
logger.Printf("FetchDocument failed on %s %s: %v",
tail.RepositoryURL, tail.FilePath, err)
Expand Down Expand Up @@ -274,8 +287,12 @@ func CrawlGithub(ctx context.Context, crawlers []Crawler, conv Converter,
wg.Add(1)
go func() {
defer wg.Done()
docCount := 0
for cdoc := range ch {
docCount++
logger.Printf("Processing doc %d found on Github", docCount)
if _, ok := seen[cdoc.ID()]; ok {
logger.Printf("the doc has been seen before")
continue
}
match := findMatch(cdoc.GetDocument(), crawlers)
Expand Down
2 changes: 1 addition & 1 deletion api/internal/crawl/index/elasticsearch.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func (idx *index) responseErrorOrNil(info string, res *esapi.Response,

defer res.Body.Close()
if res.IsError() {
return fmt.Errorf("%s: %s", messageStart, res.String())
return fmt.Errorf("%s: %s [%d]", messageStart, res.String(), res.StatusCode)
}

if reader != nil {
Expand Down

0 comments on commit 62e5abd

Please sign in to comment.