Skip to content

Commit a674b30

Browse files
committed
Fix line diff by using runes without separators
[The suggested approach](https://github.com/google/diff-match-patch/wiki/Line-or-Word-Diffs#line-mode ) for doing line level diffing is the following set of steps: 1. `ti1, ti2, linesIdx = DiffLinesToChars(t1, t2)` 2. `diffs = DiffMain(ti1, ti2)` 3. `DiffCharsToLines(diff, linesIdx)` The original implementation in `google/diff-match-patch` uses unicode codepoints for storing indices in `ti1` and `ti2` joined by an empty string. Current implementation in this repo stores them as integers joined by a comma. While this implementation makes `ti1` and `ti2` more readable, it introduces bugs when trying to rely on it when doing line level diffing with `DiffMain`. The root cause of the issue is that an integer line index might span more than one character/rune, and `DiffMain` can assume that two different lines having the same index prefix match partially. For example, indices 123 and 129 will have partial match `12`. In that example, the diff will show lines 3 and 9 which is not correct. A simple failing test case demonstrating this issue is available at `TestDiffPartialLineIndex`. In this PR I am adjusting the algorithm to use the same approach as in [diff-match-patch](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L508-L510 ) by storing each line index as a rune. While a rune in Golang is a type alias to uint32, not every uint32 can be a valid rune. During string to rune slice conversion invalid runes will be replaced with `utf.RuneError`. The integer to rune generation logic is based on the table in https://en.wikipedia.org/wiki/UTF-8#Encoding The first 127 lines will work the fastest as they are represented as a single bytes. Higher numbers are represented as 2-4 bytes. In addition to that, the range `U+D800 - U+DFFF` contains [invalid codepoints](https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling). and all codepoints higher or equal to `0xD800` are incremented by `0xDFFF - 0xD800`. The maximum representable integer using this approach is 1'112'060. This improves on Javascript implementation which currently [bails out](https://github.com/google/diff-match-patch/blob/62f2e689f498f9c92dbc588c58750addec9b1654/javascript/diff_match_patch_uncompressed.js#L503-L505 ) when files have more than 65535 lines.
1 parent 74798f5 commit a674b30

File tree

5 files changed

+133
-38
lines changed

5 files changed

+133
-38
lines changed

diffmatchpatch/diff.go

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ const (
3434
DiffInsert Operation = 1
3535
// DiffEqual item represents an equal diff.
3636
DiffEqual Operation = 0
37-
//IndexSeparator is used to seperate the array indexes in an index string
38-
IndexSeparator = ","
3937
)
4038

4139
// Diff represents one diff operation
@@ -406,14 +404,11 @@ func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune
406404
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
407405
hydrated := make([]Diff, 0, len(diffs))
408406
for _, aDiff := range diffs {
409-
chars := strings.Split(aDiff.Text, IndexSeparator)
410-
text := make([]string, len(chars))
407+
runes := []rune(aDiff.Text)
408+
text := make([]string, len(runes))
411409

412-
for i, r := range chars {
413-
i1, err := strconv.Atoi(r)
414-
if err == nil {
415-
text[i] = lineArray[i1]
416-
}
410+
for i, r := range runes {
411+
text[i] = lineArray[runeToInt(r)]
417412
}
418413

419414
aDiff.Text = strings.Join(text, "")

diffmatchpatch/diff_test.go

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -314,12 +314,12 @@ func TestDiffLinesToChars(t *testing.T) {
314314
dmp := New()
315315

316316
for i, tc := range []TestCase{
317-
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
318-
{"a", "b", "1", "2", []string{"", "a", "b"}},
317+
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
318+
{"a", "b", "\x01", "\x02", []string{"", "a", "b"}},
319319
// Omit final newline.
320-
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
320+
{"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}},
321321
// Same lines in Text1 and Text2
322-
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
322+
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
323323
} {
324324
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
325325
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -332,14 +332,13 @@ func TestDiffLinesToChars(t *testing.T) {
332332
lineList := []string{
333333
"", // Account for the initial empty element of the lines array.
334334
}
335-
var charList []string
335+
var charList []rune
336336
for x := 1; x < n+1; x++ {
337337
lineList = append(lineList, strconv.Itoa(x)+"\n")
338-
charList = append(charList, strconv.Itoa(x))
338+
charList = append(charList, rune(x))
339339
}
340340
lines := strings.Join(lineList, "")
341-
chars := strings.Join(charList[:], ",")
342-
assert.Equal(t, n, len(strings.Split(chars, ",")))
341+
chars := string(charList)
343342

344343
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
345344
assert.Equal(t, chars, actualChars1)
@@ -360,8 +359,8 @@ func TestDiffCharsToLines(t *testing.T) {
360359
for i, tc := range []TestCase{
361360
{
362361
Diffs: []Diff{
363-
{DiffEqual, "1,2,1"},
364-
{DiffInsert, "2,1,2"},
362+
{DiffEqual, "\x01\x02\x01"},
363+
{DiffInsert, "\x02\x01\x02"},
365364
},
366365
Lines: []string{"", "alpha\n", "beta\n"},
367366

@@ -380,13 +379,12 @@ func TestDiffCharsToLines(t *testing.T) {
380379
lineList := []string{
381380
"", // Account for the initial empty element of the lines array.
382381
}
383-
charList := []string{}
382+
charList := []rune{}
384383
for x := 1; x <= n; x++ {
385384
lineList = append(lineList, strconv.Itoa(x)+"\n")
386-
charList = append(charList, strconv.Itoa(x))
385+
charList = append(charList, rune(x))
387386
}
388-
assert.Equal(t, n, len(charList))
389-
chars := strings.Join(charList[:], ",")
387+
chars := string(charList)
390388

391389
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
392390
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
@@ -1471,7 +1469,7 @@ func TestMassiveRuneDiffConversion(t *testing.T) {
14711469
func TestDiffPartialLineIndex(t *testing.T) {
14721470
dmp := New()
14731471
t1, t2, tt := dmp.DiffLinesToChars(
1474-
`line 1
1472+
`line 1
14751473
line 2
14761474
line 3
14771475
line 4
@@ -1481,7 +1479,7 @@ line 7
14811479
line 8
14821480
line 9
14831481
line 10 text1`,
1484-
`line 1
1482+
`line 1
14851483
line 2
14861484
line 3
14871485
line 4
@@ -1494,10 +1492,10 @@ line 10 text2`)
14941492
diffs := dmp.DiffMain(t1, t2, false)
14951493
diffs = dmp.DiffCharsToLines(diffs, tt)
14961494
assert.Equal(t, []Diff{
1497-
Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
1498-
Diff{DiffDelete, "line 10 text1"},
1499-
Diff{DiffInsert, "line 10 text2"},
1500-
}, diffs)
1495+
Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"},
1496+
Diff{DiffDelete, "line 10 text1"},
1497+
Diff{DiffInsert, "line 10 text2"},
1498+
}, diffs)
15011499
}
15021500

15031501
func BenchmarkDiffMain(bench *testing.B) {

diffmatchpatch/stringutil.go

Lines changed: 92 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,16 @@
99
package diffmatchpatch
1010

1111
import (
12-
"strconv"
12+
"fmt"
1313
"strings"
1414
"unicode/utf8"
1515
)
1616

17+
const UNICODE_INVALID_RANGE_START = 0xD800
18+
const UNICODE_INVALID_RANGE_END = 0xDFFF
19+
const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1
20+
const UNICODE_RANGE_MAX = 0x10FFFF
21+
1722
// unescaper unescapes selected chars for compatibility with JavaScript's encodeURI.
1823
// In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc.
1924
var unescaper = strings.NewReplacer(
@@ -93,14 +98,93 @@ func intArrayToString(ns []uint32) string {
9398
return ""
9499
}
95100

96-
indexSeparator := IndexSeparator[0]
97-
98-
// Appr. 3 chars per num plus the comma.
99-
b := []byte{}
101+
b := []rune{}
100102
for _, n := range ns {
101-
b = strconv.AppendInt(b, int64(n), 10)
102-
b = append(b, indexSeparator)
103+
b = append(b, intToRune(n))
103104
}
104-
b = b[:len(b)-1]
105105
return string(b)
106106
}
107+
108+
// These constants define the number of bits representable
109+
// in 1,2,3,4 byte utf8 sequences, respectively.
110+
const ONE_BYTE_BITS = 7
111+
const TWO_BYTE_BITS = 11
112+
const THREE_BYTE_BITS = 16
113+
const FOUR_BYTE_BITS = 21
114+
115+
// Helper for getting a sequence of bits from an integer.
116+
func getBits(i uint32, cnt byte, from byte) byte {
117+
return byte((i >> from) & ((1 << cnt) - 1))
118+
}
119+
120+
// Converts an integer in the range 0~1112060 into a rune.
121+
// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8
122+
func intToRune(i uint32) rune {
123+
if i < (1 << ONE_BYTE_BITS) {
124+
return rune(i)
125+
}
126+
127+
if i < (1 << TWO_BYTE_BITS) {
128+
r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)})
129+
if size != 2 || r == utf8.RuneError {
130+
panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i))
131+
}
132+
return r
133+
}
134+
135+
// Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range
136+
// was returning utf8.RuneError during encoding.
137+
if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) {
138+
if i >= UNICODE_INVALID_RANGE_START {
139+
i += UNICODE_INVALID_RANGE_DELTA
140+
}
141+
142+
r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
143+
if size != 3 || r == utf8.RuneError {
144+
panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i))
145+
}
146+
return r
147+
}
148+
149+
if i < (1<<FOUR_BYTE_BITS - UNICODE_INVALID_RANGE_DELTA - 3) {
150+
i += UNICODE_INVALID_RANGE_DELTA + 3
151+
r, size := utf8.DecodeRune([]byte{0b11110000 | getBits(i, 3, 18), 0b10000000 | getBits(i, 6, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)})
152+
if size != 4 || r == utf8.RuneError {
153+
panic(fmt.Sprintf("Error encoding an int %d with size 4, got rune %v and size %d", size, r, i))
154+
}
155+
return r
156+
}
157+
panic(fmt.Sprintf("The integer %d is too large for runeToInt()", i))
158+
}
159+
160+
// Converts a rune generated by intToRune back to an integer
161+
func runeToInt(r rune) uint32 {
162+
i := uint32(r)
163+
if i < (1 << ONE_BYTE_BITS) {
164+
return i
165+
}
166+
167+
bytes := []byte{0, 0, 0, 0}
168+
169+
size := utf8.EncodeRune(bytes, r)
170+
171+
if size == 2 {
172+
return uint32(bytes[0]&0b11111)<<6 | uint32(bytes[1]&0b111111)
173+
}
174+
175+
if size == 3 {
176+
result := uint32(bytes[0]&0b1111)<<12 | uint32(bytes[1]&0b111111)<<6 | uint32(bytes[2]&0b111111)
177+
if result >= UNICODE_INVALID_RANGE_END {
178+
return result - UNICODE_INVALID_RANGE_DELTA
179+
}
180+
181+
return result
182+
}
183+
184+
if size == 4 {
185+
result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111)
186+
return result - UNICODE_INVALID_RANGE_DELTA - 3
187+
}
188+
189+
panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size))
190+
}

diffmatchpatch/stringutil_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,21 @@ func TestLastIndexOf(t *testing.T) {
114114
assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc))
115115
}
116116
}
117+
118+
// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation
119+
// of `intToRune() -> runeToInt()`.
120+
// This test is slow and runs longer than 5 seconds but it does provide a safety
121+
// guarantee that these 2 functions are correct for the ranges we support.
122+
func TestRuneToInt(t *testing.T) {
123+
124+
for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 {
125+
r := intToRune(i)
126+
ic := runeToInt(r)
127+
128+
assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic))
129+
}
130+
131+
assert.Panics(t, func() {
132+
intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2)
133+
})
134+
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ require (
88
gopkg.in/yaml.v2 v2.4.0 // indirect
99
)
1010

11-
go 1.12
11+
go 1.13

0 commit comments

Comments
 (0)