-
Notifications
You must be signed in to change notification settings - Fork 97
/
Copy pathschema.go
80 lines (67 loc) · 2.54 KB
/
schema.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package schema
import (
"bytes"
"fmt"
"path/filepath"
"regexp"
"strings"
"unicode/utf8"
"github.com/buildbuddy-io/buildbuddy/codesearch/types"
"github.com/buildbuddy-io/buildbuddy/server/util/git"
"github.com/gabriel-vasile/mimetype"
"github.com/go-enry/go-enry/v2"
xxhash "github.com/cespare/xxhash/v2"
)
// Limits applied to a file's contents before it is turned into a document.
const (
	// maxFileLen is the largest content length, in bytes, that MakeDocument
	// accepts; anything longer is skipped.
	maxFileLen = 10_000_000

	// detectionBufferSize is the maximum number of bytes from a file to use
	// for language and mimetype detection.
	detectionBufferSize = 1000
)

// Names of the fields used in the indexed docs.
const (
	IDField       = "id"
	FilenameField = "filename"
	ContentField  = "content"
	LanguageField = "language"
	OwnerField    = "owner"
	RepoField     = "repo"
	SHAField      = "sha"
)
// skipMime matches the MIME types that MakeDocument refuses to index: every
// audio, video and image type, plus gzip archives.
//
// The alternation is wrapped in a group so that ^ and $ apply to every
// alternative. Without the group, ^ binds only to the first alternative and $
// only to the last, so e.g. "video/" would match anywhere in the string.
//
// TODO(tylerw): this should come from a flag?
var skipMime = regexp.MustCompile(`^(?:audio/.*|video/.*|image/.*|application/gzip)$`)
// MakeDocument builds the document that is indexed for a single file.
//
// name is the file's name, commitSha is recorded in the SHA field, repoURL
// supplies the owner and repo fields, and buf holds the file's contents.
//
// A nil document and a non-nil error are returned when the file is skipped:
//   - buf is longer than maxFileLen,
//   - the detected MIME type matches skipMime, or
//   - buf is not valid UTF-8.
func MakeDocument(name, commitSha string, repoURL *git.RepoURL, buf []byte) (types.Document, error) {
	if len(buf) > maxFileLen {
		return nil, fmt.Errorf("skipping %s (file too long)", name)
	}

	// MIME and language detection only look at a bounded prefix of the file.
	sample := buf
	if len(sample) > detectionBufferSize {
		sample = sample[:detectionBufferSize]
	}

	// A detection error is not fatal: the file is rejected only when
	// detection succeeds and reports one of the skipped types.
	if detected, err := mimetype.DetectReader(bytes.NewReader(sample)); err == nil {
		if mime := detected.String(); skipMime.MatchString(mime) {
			return nil, fmt.Errorf("skipping %s (invalid mime type: %q)", name, mime)
		}
	}

	// The whole file, not just the sample, must be valid UTF-8.
	if !utf8.Valid(buf) {
		return nil, fmt.Errorf("skipping %s (non-utf8 content)", name)
	}

	// The ID is the decimal form of a 64-bit hash over owner, repo and name.
	// NOTE(review): the three parts are concatenated without a separator, so
	// different (owner, repo, name) triples can produce the same hash input.
	hashKey := repoURL.Owner + repoURL.Repo + name
	id := []byte(fmt.Sprintf("%d", xxhash.Sum64String(hashKey)))

	// The language is detected from the base filename plus the content sample.
	language := strings.ToLower(enry.GetLanguage(filepath.Base(name), sample))

	fields := map[string]types.NamedField{
		IDField:       types.NewNamedField(types.KeywordField, IDField, id, false /*=stored*/),
		FilenameField: types.NewNamedField(types.TrigramField, FilenameField, []byte(name), true /*=stored*/),
		ContentField:  types.NewNamedField(types.SparseNgramField, ContentField, buf, true /*=stored*/),
		LanguageField: types.NewNamedField(types.KeywordField, LanguageField, []byte(language), true /*=stored*/),
		OwnerField:    types.NewNamedField(types.KeywordField, OwnerField, []byte(repoURL.Owner), true /*=stored*/),
		RepoField:     types.NewNamedField(types.KeywordField, RepoField, []byte(repoURL.Repo), true /*=stored*/),
		SHAField:      types.NewNamedField(types.KeywordField, SHAField, []byte(commitSha), true /*=stored*/),
	}
	return types.NewMapDocument(fields), nil
}