Skip to content

Commit

Permalink
Print as text if mostly text (#258)
Browse files Browse the repository at this point in the history
The previous heuristic of treating strings as binary data
if it contains any invalid UTF-8 was too strict.
Loosen the heuristic to check if most of the characters
are printable text.

Fixes #257
  • Loading branch information
dsnet authored May 25, 2021
1 parent 9181d1e commit d103655
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 17 deletions.
5 changes: 5 additions & 0 deletions cmp/compare_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,11 @@ using the AllowUnexported option.`, "\n"),
x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,#=_value _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,#=_value _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,#=_value _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,#=_value _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c,#=_value _value=4 41\t",
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\t",
reason: "leading/trailing equal spans should not appear in diff lines",
}, {
label: label + "/MostlyTextString",
x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n",
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n",
reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text",
}, {
label: label + "/AllLinesDiffer",
x: "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n",
Expand Down
35 changes: 18 additions & 17 deletions cmp/report_slices.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package cmp
import (
"bytes"
"fmt"
"math"
"reflect"
"strconv"
"strings"
Expand Down Expand Up @@ -96,30 +97,29 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
}

// Auto-detect the type of the data.
var isLinedText, isText, isBinary bool
var sx, sy string
var ssx, ssy []string
var isString, isMostlyText, isPureLinedText, isBinary bool
switch {
case t.Kind() == reflect.String:
sx, sy = vx.String(), vy.String()
isText = true // Initial estimate, verify later
isString = true
case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)):
sx, sy = string(vx.Bytes()), string(vy.Bytes())
isBinary = true // Initial estimate, verify later
isString = true
case t.Kind() == reflect.Array:
// Arrays need to be addressable for slice operations to work.
vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem()
vx2.Set(vx)
vy2.Set(vy)
vx, vy = vx2, vy2
}
if isText || isBinary {
var numLines, lastLineIdx, maxLineLen int
isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy)
if isString {
var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int
for i, r := range sx + sy {
if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError {
isBinary = true
break
numTotalRunes++
if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError {
numValidRunes++
}
if r == '\n' {
if maxLineLen < i-lastLineIdx {
Expand All @@ -129,12 +129,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
numLines++
}
}
isText = !isBinary
isLinedText = isText && numLines >= 4 && maxLineLen <= 1024
isPureText := numValidRunes == numTotalRunes
isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes))
isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024
isBinary = !isMostlyText

// Avoid diffing by lines if it produces a significantly more complex
// edit script than diffing by bytes.
if isLinedText {
if isPureLinedText {
ssx = strings.Split(sx, "\n")
ssy = strings.Split(sy, "\n")
esLines := diff.Difference(len(ssx), len(ssy), func(ix, iy int) diff.Result {
Expand All @@ -145,7 +147,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
})
efficiencyLines := float64(esLines.Dist()) / float64(len(esLines))
efficiencyBytes := float64(esBytes.Dist()) / float64(len(esBytes))
isLinedText = efficiencyLines < 4*efficiencyBytes
isPureLinedText = efficiencyLines < 4*efficiencyBytes
}
}

Expand All @@ -155,7 +157,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
switch {
// If the text appears to be multi-lined text,
// then perform differencing across individual lines.
case isLinedText:
case isPureLinedText:
list = opts.formatDiffSlice(
reflect.ValueOf(ssx), reflect.ValueOf(ssy), 1, "line",
func(v reflect.Value, d diffMode) textRecord {
Expand Down Expand Up @@ -244,15 +246,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
// If the text appears to be single-lined text,
// then perform differencing in approximately fixed-sized chunks.
// The output is printed as quoted strings.
case isText:
case isMostlyText:
list = opts.formatDiffSlice(
reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte",
func(v reflect.Value, d diffMode) textRecord {
s := formatString(v.String())
return textRecord{Diff: d, Value: textLine(s)}
},
)
delim = ""

// If the text appears to be binary data,
// then perform differencing in approximately fixed-sized chunks.
Expand Down Expand Up @@ -314,7 +315,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {

// Wrap the output with appropriate type information.
var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"}
if !isText {
if !isMostlyText {
// The "{...}" byte-sequence literal is not valid Go syntax for strings.
// Emit the type for extra clarity (e.g. "string{...}").
if t.Kind() == reflect.String {
Expand Down
19 changes: 19 additions & 0 deletions cmp/testdata/diffs
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,25 @@
` _value=4 41 `,
}, "")
>>> TestDiff/Reporter/SurroundingEqualElements
<<< TestDiff/Reporter/MostlyTextString
strings.Join({
"org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa",
- ",\xff=_value",
" _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1",
"=a,tag2=bb",
- ",\xff=_value",
" _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
"=b,tag2=cc",
- ",\xff=_value",
" _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
"=a,tag2=dd",
- ",\xff=_value",
" _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1",
"=c",
- ",\xff=_value",
" _value=4 41\n",
}, "")
>>> TestDiff/Reporter/MostlyTextString
<<< TestDiff/Reporter/AllLinesDiffer
strings.Join({
+ "X",
Expand Down

0 comments on commit d103655

Please sign in to comment.