Skip to content

Commit

Permalink
feat(decoders): basic escaped unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz authored and Richard Gomez committed Feb 14, 2024
1 parent d54cf20 commit 4cc1644
Show file tree
Hide file tree
Showing 5 changed files with 1,166 additions and 1,014 deletions.
1 change: 1 addition & 0 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ func DefaultDecoders() []Decoder {
&UTF8{},
&Base64{},
&UTF16{},
&EscapedUnicode{},
}
}

Expand Down
73 changes: 73 additions & 0 deletions pkg/decoders/escaped_unicode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package decoders

import (
"regexp"
"strconv"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// EscapedUnicode is a Decoder that rewrites escaped Unicode sequences
// (for example `\u0041`) found in a chunk's data into UTF-8 bytes.
type EscapedUnicode struct{}

// Compile-time check that *EscapedUnicode satisfies Decoder.
var _ Decoder = (*EscapedUnicode)(nil)

// escapePat matches one or two backslashes followed by `u` (case-insensitive)
// and exactly four hexadecimal digits; the digits are the only capture group.
//
// It might be advantageous to limit these to a subset of acceptable characters, similar to base64.
// https://dencode.com/en/string/unicode-escape
var escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`)

// FromChunk decodes escaped Unicode sequences in chunk.Data.
//
// It returns nil when chunk is nil, when it carries no data, or when the data
// contains nothing matched by escapePat. Otherwise chunk.Data is replaced with
// the decoded bytes and that same chunk is returned wrapped in a
// DecodableChunk tagged with DecoderType_ESCAPED_UNICODE.
func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	// Cheap pre-check so chunks without any escape sequence are rejected early.
	if !escapePat.Match(chunk.Data) {
		return nil
	}

	out := decodeUnicode(chunk.Data)
	if out == nil {
		return nil
	}
	chunk.Data = out

	return &DecodableChunk{
		Chunk:       chunk,
		DecoderType: detectorspb.DecoderType_ESCAPED_UNICODE,
	}
}

// decodeUnicode returns a copy of input in which every escape sequence matched
// by escapePat (for example `\u0041` or `\\u0041`) is replaced by the UTF-8
// encoding of the code point named by its four hex digits. It returns nil when
// input contains no such sequence.
//
// The result is assembled in a freshly allocated buffer in a single forward
// pass. input is never written to, so other holders of the original bytes are
// unaffected, and the work is linear in len(input) rather than re-copying the
// tail once per match.
//
// Surrogate halves (U+D800..U+DFFF) are not combined into a single code point;
// utf8.EncodeRune encodes each of them as U+FFFD.
func decodeUnicode(input []byte) []byte {
	// Each match is [fullStart, fullEnd, hexStart, hexEnd]; the hex group is
	// the tail of the match, so hexEnd is also the end of the whole escape.
	indices := escapePat.FindAllSubmatchIndex(input, -1)
	if len(indices) == 0 {
		return nil
	}

	// An escape is at least six bytes long and decodes to at most three, so
	// the output can never outgrow the input.
	out := make([]byte, 0, len(input))

	var (
		scratch [utf8.UTFMax]byte
		copied  int // input[:copied] has already been emitted to out
	)
	for _, m := range indices {
		start, hexStart, end := m[0], m[2], m[3]

		codePoint, err := strconv.ParseUint(string(input[hexStart:end]), 16, 32)
		if err != nil {
			// The pattern only admits hex digits, so this should not happen.
			// Skip the match; its raw bytes are emitted with the next
			// literal run because copied is not advanced.
			continue
		}

		// Emit the literal bytes preceding the escape, then the decoded rune.
		out = append(out, input[copied:start]...)
		n := utf8.EncodeRune(scratch[:], rune(codePoint))
		out = append(out, scratch[:n]...)
		copied = end
	}

	// Emit whatever follows the last escape.
	return append(out, input[copied:]...)
}
73 changes: 73 additions & 0 deletions pkg/decoders/escaped_unicode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package decoders

import (
"testing"

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// TestUnicodeEscape_FromChunk checks EscapedUnicode.FromChunk against chunks
// that are fully escaped, partially escaped, escaped with doubled backslashes,
// free of escapes, empty, and nil.
func TestUnicodeEscape_FromChunk(t *testing.T) {
	tests := []struct {
		name  string
		chunk *sources.Chunk
		want  *sources.Chunk // nil means FromChunk is expected to return nil
	}{
		{
			name: "all escaped",
			chunk: &sources.Chunk{
				Data: []byte("\\u0074\\u006f\\u006b\\u0065\\u006e\\u003a\\u0020\\u0022\\u0067\\u0068\\u0070\\u005f\\u0049\\u0077\\u0064\\u004d\\u0078\\u0039\\u0057\\u0046\\u0057\\u0052\\u0052\\u0066\\u004d\\u0068\\u0054\\u0059\\u0069\\u0061\\u0056\\u006a\\u005a\\u0037\\u0038\\u004a\\u0066\\u0075\\u0061\\u006d\\u0076\\u006e\\u0030\\u0059\\u0057\\u0052\\u004d\\u0030\\u0022"),
			},
			want: &sources.Chunk{
				Data: []byte("token: \"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\""),
			},
		},
		{
			name: "mixed content",
			chunk: &sources.Chunk{
				Data: []byte("npm config set @trufflesec:registry=https://npm.pkg.github.com\nnpm config set //npm.pkg.github.com:_authToken=$'\\u0067hp_9ovSHEBCq0drG42yjoam76iNybtqLN25CgSf'"),
			},
			want: &sources.Chunk{
				Data: []byte("npm config set @trufflesec:registry=https://npm.pkg.github.com\nnpm config set //npm.pkg.github.com:_authToken=$'ghp_9ovSHEBCq0drG42yjoam76iNybtqLN25CgSf'"),
			},
		},
		{
			name: "multiple slashes",
			chunk: &sources.Chunk{
				Data: []byte(`SameValue("hello","\\u0068el\\u006co"); // true`),
			},
			want: &sources.Chunk{
				Data: []byte(`SameValue("hello","hello"); // true`),
			},
		},
		{
			name: "no escaped",
			chunk: &sources.Chunk{
				Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312
+//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`),
			},
			want: nil,
		},
		{
			name:  "empty data",
			chunk: &sources.Chunk{},
			want:  nil,
		},
		{
			name:  "nil chunk",
			chunk: nil,
			want:  nil,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			d := &EscapedUnicode{}
			got := d.FromChunk(tt.chunk)

			// Cases that must be rejected by the decoder.
			if tt.want == nil {
				if got != nil {
					t.Error("Expected nil chunk")
				}
				return
			}

			if got == nil {
				t.Fatal("got nil, did not want nil")
			}
			if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" {
				t.Errorf("EscapedUnicode.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff)
			}
		})
	}
}
Loading

0 comments on commit 4cc1644

Please sign in to comment.