Skip to content

Commit

Permalink
Detect charset and convert non UTF-8 files for display (go-gitea#4950)
Browse files Browse the repository at this point in the history
* Detect charset and convert non UTF-8 files for display

* Refactor and move function to correct module

* Revert unrelated changes

* More unrelated changes

* Duplicate content for small text to have better encoding detection

* Check if original content is valid before duplicating it
  • Loading branch information
lafriks committed Sep 29, 2018
1 parent 93dcc6c commit 785b17c
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 4 deletions.
17 changes: 16 additions & 1 deletion modules/base/tool.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) {
return "UTF-8", nil
}

result, err := chardet.NewTextDetector().DetectBest(content)
textDetector := chardet.NewTextDetector()
var detectContent []byte
if len(content) < 1024 {
// Check if original content is valid
if _, err := textDetector.DetectBest(content); err != nil {
return "", err
}
times := 1024 / len(content)
detectContent = make([]byte, 0, times*len(content))
for i := 0; i < times; i++ {
detectContent = append(detectContent, content...)
}
} else {
detectContent = content
}
result, err := textDetector.DetectBest(detectContent)
if err != nil {
return "", err
}
Expand Down
25 changes: 24 additions & 1 deletion modules/templates/helper.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright 2018 The Gitea Authors. All rights reserved.
// Copyright 2014 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
Expand Down Expand Up @@ -243,7 +244,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't loose data.
// original left over. This way we won't lose data.
result, n, err := transform.String(encoding.NewDecoder(), string(content))
if err != nil {
result = result + string(content[n:])
Expand All @@ -252,6 +253,28 @@ func ToUTF8WithErr(content []byte) (string, error) {
return result, err
}

// ToUTF8WithFallback tries to convert content to UTF-8 and falls back to
// returning content unchanged when conversion is not possible.
//
// The input is returned as-is when base.DetectEncoding fails, when it reports
// "UTF-8", or when charset.Lookup yields no encoding for the detected label.
// If decoding stops with an error, the successfully decoded prefix is returned
// followed by the remaining, undecoded original bytes.
func ToUTF8WithFallback(content []byte) []byte {
	label, detectErr := base.DetectEncoding(content)
	if detectErr != nil {
		return content
	}
	if label == "UTF-8" {
		return content
	}

	// Only the encoding itself is needed; the second return value is unused.
	enc, _ := charset.Lookup(label)
	if enc == nil {
		return content
	}

	decoded, consumed, decodeErr := transform.Bytes(enc.NewDecoder(), content)
	if decodeErr == nil {
		return decoded
	}

	// Decoding stopped early: keep what was decoded and append the original
	// bytes from the point where it stopped, so that no data is lost.
	return append(decoded, content[consumed:]...)
}

// ToUTF8 converts content to UTF-8 encoding and ignores errors
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))
Expand Down
6 changes: 4 additions & 2 deletions routers/repo/view.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"

"github.com/Unknwon/paginater"
)

Expand Down Expand Up @@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) {
ctx.Data["FileSize"] = readmeFile.Size()
} else {
d, _ := ioutil.ReadAll(dataRc)
buf = append(buf, d...)
buf = templates.ToUTF8WithFallback(append(buf, d...))

if markup.Type(readmeFile.Name()) != "" {
ctx.Data["IsMarkup"] = true
ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
Expand Down Expand Up @@ -201,7 +203,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
}

d, _ := ioutil.ReadAll(dataRc)
buf = append(buf, d...)
buf = templates.ToUTF8WithFallback(append(buf, d...))

readmeExist := markup.IsReadmeFile(blob.Name())
ctx.Data["ReadmeExist"] = readmeExist
Expand Down

0 comments on commit 785b17c

Please sign in to comment.