Skip to content

Commit

Permalink
Allow code search by filename
Browse files Browse the repository at this point in the history
Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
  • Loading branch information
bsofiato committed Oct 9, 2024
1 parent 8bee7fc commit 0de61a1
Show file tree
Hide file tree
Showing 38 changed files with 688 additions and 49 deletions.
21 changes: 21 additions & 0 deletions models/fixtures/repo_unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -712,3 +712,24 @@
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810

-
id: 108
repo_id: 62
type: 1
config: "{}"
created_unix: 946684810

-
id: 109
repo_id: 62
type: 2
config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
created_unix: 946684810

-
id: 110
repo_id: 62
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810
31 changes: 31 additions & 0 deletions models/fixtures/repository.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1768,3 +1768,34 @@
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false

-
id: 62
owner_id: 42
owner_name: org42
lower_name: search-by-path
name: search-by-path
default_branch: master
num_watches: 0
num_stars: 0
num_forks: 0
num_issues: 0
num_closed_issues: 0
num_pulls: 0
num_closed_pulls: 0
num_milestones: 0
num_closed_milestones: 0
num_projects: 0
num_closed_projects: 0
is_private: false
is_empty: false
is_archived: false
is_mirror: false
status: 0
is_fork: false
fork_id: 0
is_template: false
template_id: 0
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false
37 changes: 37 additions & 0 deletions models/fixtures/user.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,40 @@
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false

-
id: 42
lower_name: org42
name: org42
full_name: Org42
email: org42@example.com
keep_email_private: false
email_notifications_preference: onmention
passwd: ZogKvWdyEx:password
passwd_hash_algo: dummy
must_change_password: false
login_source: 0
login_name: org42
type: 1
salt: ZogKvWdyEx
max_repo_creation: -1
is_active: false
is_admin: false
is_restricted: false
allow_git_hook: false
allow_import_local: false
allow_create_organization: true
prohibit_login: false
avatar: avatar42
avatar_email: org42@example.com
use_custom_avatar: false
num_followers: 0
num_following: 0
num_stars: 0
num_repos: 1
num_teams: 0
num_members: 0
visibility: 0
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false
6 changes: 3 additions & 3 deletions models/repo/repo_list_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,12 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
count: 33,
count: 34,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
count: 38,
count: 39,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
Expand All @@ -158,7 +158,7 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfOrganization",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
count: 33,
count: 34,
},
{
name: "AllTemplates",
Expand Down
5 changes: 4 additions & 1 deletion models/user/user_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) {
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
[]int64{26, 41})

testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
[]int64{42})

testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
[]int64{})

// test users
Expand Down
44 changes: 37 additions & 7 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
Expand Down Expand Up @@ -53,6 +54,7 @@ type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
Filename string
Language string
UpdatedAt time.Time
}
Expand All @@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {

const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6
repoIndexerLatestVersion = 7
)

// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
Expand All @@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)

fileNamedMapping := bleve.NewTextFieldMapping()
fileNamedMapping.IncludeInAll = false
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)

termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
Expand All @@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)

mapping := bleve.NewIndexMapping()

if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
Expand All @@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
}); err != nil {
return nil, err
}

if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
}); err != nil {
return nil, err
}

mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
Expand Down Expand Up @@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
Expand Down Expand Up @@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)

phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)

contentQuery := bleve.NewMatchQuery(opts.Keyword)
contentQuery.FieldVal = "Content"

if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}

keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)

if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
Expand Down Expand Up @@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int

from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true

if len(opts.Language) == 0 {
Expand Down Expand Up @@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
endIndex = locationEnd
}
}
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}

language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
Expand Down
101 changes: 101 additions & 0 deletions modules/indexer/code/bleve/token/path/path.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package path

import (
"slices"
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

// Name is the identifier under which the path token filter is registered
// with the bleve registry and referenced from analyzer definitions.
const Name = "gitea/path"

// TokenFilter is a stateless bleve token filter that turns the tokens of a
// file path into hierarchical path terms.
type TokenFilter struct{}

// NewTokenFilter returns a ready-to-use TokenFilter.
func NewTokenFilter() *TokenFilter {
	return new(TokenFilter)
}

// TokenFilterConstructor is the factory handed to the bleve registry. The
// filter takes no options, so both config and cache are ignored and the call
// never fails.
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
	var filter analysis.TokenFilter = NewTokenFilter()
	return filter, nil
}

// Filter expands the input tokens into path terms. The forward chain is
// always emitted; the reversed chain is appended after it unless the input
// holds exactly one token, in which case both chains would be identical.
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	// The forward chain is generated first, then the reversed one.
	output := generatePathTokens(input, false)

	if len(input) != 1 {
		output = append(output, generatePathTokens(input, true)...)
	}

	return output
}

// generatePathTokens builds hierarchical path terms from the input tokens.
//
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the
// input tokens and combines them cumulatively with "/", generating one term for
// each component in the tree (e.g., foo/bar/baz.md generates foo, foo/bar, and
// foo/bar/baz.md).
//
// If the reversed flag is set, the tokens are combined in reverse order (the
// same input generates baz.md, baz.md/bar, and baz.md/bar/foo). This is useful
// to efficiently search for filenames without supplying the full path.
//
// The input stream is not modified. Every returned token is newly allocated
// with Position 1 and Type analysis.AlphaNumeric; an empty input yields an
// empty stream.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
	if reversed {
		// Reverse a copy so the caller's stream keeps its original order.
		input = slices.Clone(input)
		slices.Reverse(input)
	}

	// Each term extends the previous one by exactly one component, so a single
	// builder accumulates the whole chain, including the final, complete path.
	terms := make([]string, 0, len(input))
	var sb strings.Builder
	for i, token := range input {
		if i > 0 {
			sb.WriteByte('/')
		}
		sb.Write(token.Term)
		terms = append(terms, sb.String())
	}

	// Terms only ever grow, so the builder now holds the longest one.
	longestTerm := sb.Len()

	output := make(analysis.TokenStream, 0, len(terms))
	for _, term := range terms {
		// Forward terms are aligned to the end of the longest term, reversed
		// terms to its start.
		// NOTE(review): these offsets look swapped relative to where each term
		// sits in the original path - confirm before relying on Start/End.
		start, end := longestTerm-len(term), longestTerm
		if reversed {
			start, end = 0, len(term)
		}

		output = append(output, &analysis.Token{
			Position: 1,
			Start:    start,
			End:      end,
			Type:     analysis.AlphaNumeric,
			Term:     []byte(term),
		})
	}

	return output
}

// init registers the path token filter constructor with the bleve registry
// under Name when this package is imported.
func init() {
registry.RegisterTokenFilter(Name, TokenFilterConstructor)
}
Loading

0 comments on commit 0de61a1

Please sign in to comment.