Merge pull request ianbruene#7 from FrnchFrgg/master

ianbruene · web-flow · commit 4ca4252ef238 · 2019-11-09T07:58:55.000-06:00
Improve population of the "line contents -> indices in b" map
diff --git a/difflib/bytes/bytes.go b/difflib/bytes/bytes.go
@@ -49,8 +49,8 @@ func calculateRatio(matches, length int) float64 {
 
 func listifyString(str []byte) (lst [][]byte) {
 	lst = make([][]byte, len(str))
-	for i, c := range str {
-		lst[i] = []byte{c}
+	for i := range str {
+		lst[i] = str[i:i+1]
 	}
 	return lst
 }
@@ -84,62 +84,79 @@ type B2J struct {
 	b [][]byte
 }
 
-func newB2J (b [][]byte) *B2J {
-	b2j := B2J{store: map[lineHash] [][]int{}, b: b}
-	for lineno, line := range b {
-		h := _hash(line)
+type lineType int8
+const (
+	lineNONE    lineType =  0
+	lineNORMAL  lineType =  1
+	lineJUNK    lineType = -1
+	linePOPULAR lineType = -2
+)
+
+func (b2j *B2J) _find(line *[]byte) (h lineHash, slotIndex int,
+                                     slot []int, lt lineType) {
+	h = _hash(*line)
+	for slotIndex, slot = range b2j.store[h] {
 		// Thanks to the qualities of sha1, the probability of having more than
 		// one line content with the same hash is very low. Nevertheless, store
 		// each of them in a different slot, that we can differentiate by
 		// looking at the line contents in the b slice.
-		for slotIndex, slot := range b2j.store[h] {
-			if bytes.Equal(line, b[slot[0]]) {
-				// The content already has a slot in its hash bucket. Just
-				// append the newly seen index to the slice in that slot
-				b2j.store[h][slotIndex] = append(slot, lineno)
-				goto cont
+		// In place of all the line numbers where the line appears, a slot can
+		// also contain [lineno, -1] if b[lineno] is junk.
+		if bytes.Equal(*line, b2j.b[slot[0]]) {
+			// The content already has a slot in its hash bucket.
+			if len(slot) == 2 && slot[1] < 0 {
+				lt = lineType(slot[1])
+			} else {
+				lt = lineNORMAL
 			}
+			return // every return variable has the correct value
 		}
-		// The line content still has no slot. Create one with a single value.
-		b2j.store[h] = append(b2j.store[h], []int{lineno})
-		cont:
 	}
-	return &b2j
+	// The line content still has no slot.
+	slotIndex = -1
+	slot = nil
+	lt = lineNONE
+	return
 }
 
-func (b2j *B2J) get(line []byte) []int {
-	// Thanks to the qualities of sha1, there should be very few (zero or one)
-	// slots, so the following loop is fast.
-	for _, slot := range b2j.store[_hash(line)] {
-		if bytes.Equal(line, b2j.b[slot[0]]) {
-			return slot
-		}
+func newB2J (b [][]byte, isJunk func([]byte) bool, autoJunk bool) *B2J {
+	b2j := B2J{store: map[lineHash] [][]int{}, b: b}
+	ntest := len(b)
+	if autoJunk && ntest >= 200 {
+		ntest = ntest/100 + 1
 	}
-	return []int{}
-}
-
-func (b2j *B2J) delete(line []byte) {
-	h := _hash(line)
-	slots := b2j.store[h]
-	for slotIndex, slot := range slots {
-		if bytes.Equal(line, b2j.b[slot[0]]) {
-			// Remove the whole slot from the list of slots
-			b2j.store[h] = append(slots[:slotIndex], slots[slotIndex+1:]...)
-			return
+	for lineno, line := range b {
+		h, slotIndex, slot, lt := b2j._find(&line)
+		switch lt {
+		case lineNORMAL:
+			if len(slot) >= ntest {
+				b2j.store[h][slotIndex] = []int{slot[0], int(linePOPULAR)}
+			} else {
+				b2j.store[h][slotIndex] = append(slot, lineno)
+			}
+		case lineNONE:
+			if isJunk != nil && isJunk(line) {
+				b2j.store[h] = append(b2j.store[h], []int{lineno, int(lineJUNK)})
+			} else {
+				b2j.store[h] = append(b2j.store[h], []int{lineno})
+			}
+		default:
 		}
 	}
+	return &b2j
 }
 
-func (b2j *B2J) deleteHash(h lineHash) {
-	delete(b2j.store, h)
+func (b2j *B2J) get(line []byte) []int {
+	_, _, slot, lt := b2j._find(&line)
+	if lt == lineNORMAL {
+		return slot
+	}
+	return []int{}
 }
 
-func (b2j *B2J) iter(hook func([]byte, []int)) {
-	for _, slots := range b2j.store {
-		for _, slot := range slots {
-			hook(b2j.b[slot[0]], slot)
-		}
-	}
+func (b2j *B2J) isBJunk(line []byte) bool {
+	_, _, _, lt := b2j._find(&line)
+	return lt == lineJUNK
 }
 
 // SequenceMatcher compares sequence of strings. The basic
@@ -174,10 +191,8 @@ type SequenceMatcher struct {
 	b2j            B2J
 	IsJunk         func([]byte) bool
 	autoJunk       bool
-	bJunk          map[lineHash]struct{}
 	matchingBlocks []Match
 	fullBCount     map[lineHash]int
-	bPopular       []int
 	opCodes        []OpCode
 }
 
@@ -234,45 +249,10 @@ func (m *SequenceMatcher) SetSeq2(b [][]byte) {
 
 func (m *SequenceMatcher) chainB() {
 	// Populate line -> index mapping
-	b2j := *newB2J(m.b)
-
-	// Purge junk elements
-	m.bJunk = map[lineHash]struct{}{}
-	if m.IsJunk != nil {
-		junk := m.bJunk
-		b2j.iter(func (s []byte, _ []int){
-			if m.IsJunk(s) {
-				junk[_hash(s)] = struct{}{}
-			}
-		})
-		for h, _ := range junk {
-			b2j.deleteHash(h)
-		}
-	}
-
-	// Purge remaining popular elements
-	popular := []int{}
-	n := len(m.b)
-	if m.autoJunk && n >= 200 {
-		ntest := n/100 + 1
-		b2j.iter(func (s []byte, indices []int){
-			if len(indices) > ntest {
-				popular = append(popular, indices[0])
-			}
-		})
-		for _, i := range popular {
-			b2j.delete(m.b[i])
-		}
-	}
-	m.bPopular = popular
+	b2j := *newB2J(m.b, m.IsJunk, m.autoJunk)
 	m.b2j = b2j
 }
 
-func (m *SequenceMatcher) isBJunk(s []byte) bool {
-	_, ok := m.bJunk[_hash(s)]
-	return ok
-}
-
 // Find longest matching block in a[alo:ahi] and b[blo:bhi].
 //
 // If IsJunk is not defined:
@@ -340,12 +320,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
 	// "popular" non-junk elements aren't in b2j, which greatly speeds
 	// the inner loop above, but also means "the best" match so far
 	// doesn't contain any junk *or* popular non-junk elements.
-	for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) &&
+	for besti > alo && bestj > blo && !m.b2j.isBJunk(m.b[bestj-1]) &&
 		bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
 		besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 	}
 	for besti+bestsize < ahi && bestj+bestsize < bhi &&
-		!m.isBJunk(m.b[bestj+bestsize]) &&
+		!m.b2j.isBJunk(m.b[bestj+bestsize]) &&
 		bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
 		bestsize += 1
 	}
@@ -357,12 +337,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
 	// figuring out what to do with it.  In the case of an empty
 	// interesting match, this is clearly the right thing to do,
 	// because no other kind of match is possible in the regions.
-	for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) &&
+	for besti > alo && bestj > blo && m.b2j.isBJunk(m.b[bestj-1]) &&
 		bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
 		besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
 	}
 	for besti+bestsize < ahi && bestj+bestsize < bhi &&
-		m.isBJunk(m.b[bestj+bestsize]) &&
+		m.b2j.isBJunk(m.b[bestj+bestsize]) &&
 		bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
 		bestsize += 1
 	}
diff --git a/difflib/bytes/bytes_test.go b/difflib/bytes/bytes_test.go
@@ -7,6 +7,7 @@ import (
 	"reflect"
 	"strings"
 	"testing"
+	"sort"
 )
 
 func assertAlmostEqual(t *testing.T, a, b float64, places int) {
@@ -234,20 +235,41 @@ func rep(s string, count int) string {
 	return strings.Repeat(s, count)
 }
 
+func getall(b2j *B2J, lt lineType) [][]byte {
+	result := []int{}
+	for _, slots := range b2j.store {
+		for _, slot := range slots {
+			slt := lineNORMAL
+			if len(slot) == 2 && slot[1] < 0 {
+				slt = lineType(slot[1])
+			}
+			if lt == slt {
+				result = append(result, slot[0])
+			}
+		}
+	}
+	sort.Ints(result)
+	lines := make([][]byte, len(result))
+	for i, lineno := range result {
+		lines[i] = b2j.b[lineno]
+	}
+	return lines
+}
+
 func TestWithAsciiOneInsert(t *testing.T) {
 	sm := NewMatcher(splitChars(rep("b", 100)),
 		splitChars("a"+rep("b", 100)))
 	assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
 	assertEqual(t, sm.GetOpCodes(),
 		[]OpCode{{'i', 0, 0, 0, 1}, {'e', 0, 100, 1, 101}})
-	assertEqual(t, len(sm.bPopular), 0)
+	assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)
 
 	sm = NewMatcher(splitChars(rep("b", 100)),
 		splitChars(rep("b", 50)+"a"+rep("b", 50)))
 	assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
 	assertEqual(t, sm.GetOpCodes(),
 		[]OpCode{{'e', 0, 50, 0, 50}, {'i', 50, 50, 50, 51}, {'e', 50, 100, 51, 101}})
-	assertEqual(t, len(sm.bPopular), 0)
+	assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)
 }
 
 func TestWithAsciiOnDelete(t *testing.T) {
@@ -264,18 +286,18 @@ func TestWithAsciiBJunk(t *testing.T) {
 	}
 	sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
-	assertEqual(t, sm.bJunk, map[lineHash]struct{}{})
+	assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{})
 
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})
+	assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{SPACE})
 
 	isJunk = func(s []byte) bool {
 		return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')
 	}
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
+	assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{[]byte{'b'}, SPACE})
 }
 
 func TestSFBugsRatioForNullSeqn(t *testing.T) {
diff --git a/difflib/difflib.go b/difflib/difflib.go
@@ -100,10 +100,10 @@ type SequenceMatcher struct {
 	b2j            map[string][]int
 	IsJunk         func(string) bool
 	autoJunk       bool
-	bJunk          map[string]struct{}
+	bJunk          map[string]bool
 	matchingBlocks []Match
 	fullBCount     map[string]int
-	bPopular       map[string]struct{}
+	bPopular       map[string]bool
 	opCodes        []OpCode
 }
 
@@ -161,42 +161,31 @@ func (m *SequenceMatcher) SetSeq2(b []string) {
 func (m *SequenceMatcher) chainB() {
 	// Populate line -> index mapping
 	b2j := map[string][]int{}
-	for i, s := range m.b {
-		indices := b2j[s]
-		indices = append(indices, i)
-		b2j[s] = indices
+	junk := map[string]bool{}
+	popular := map[string]bool{}
+	ntest := len(m.b)
+	if m.autoJunk && ntest >= 200 {
+		ntest = ntest/100 + 1
 	}
-
-	// Purge junk elements
-	m.bJunk = map[string]struct{}{}
-	if m.IsJunk != nil {
-		junk := m.bJunk
-		for s, _ := range b2j {
-			if m.IsJunk(s) {
-				junk[s] = struct{}{}
+	for i, s := range m.b {
+		if !junk[s] {
+			if m.IsJunk != nil && m.IsJunk(s) {
+				junk[s] = true
+			} else if !popular[s] {
+				ids := append(b2j[s], i)
+				if len(ids) <= ntest {
+					b2j[s] = ids
+				} else {
+					delete(b2j, s)
+					popular[s] = true
+				}
 			}
 		}
-		for s, _ := range junk {
-			delete(b2j, s)
-		}
 	}
 
-	// Purge remaining popular elements
-	popular := map[string]struct{}{}
-	n := len(m.b)
-	if m.autoJunk && n >= 200 {
-		ntest := n/100 + 1
-		for s, indices := range b2j {
-			if len(indices) > ntest {
-				popular[s] = struct{}{}
-			}
-		}
-		for s, _ := range popular {
-			delete(b2j, s)
-		}
-	}
-	m.bPopular = popular
 	m.b2j = b2j
+	m.bJunk = junk
+	m.bPopular = popular
 }
 
 func (m *SequenceMatcher) isBJunk(s string) bool {
diff --git a/difflib/difflib_test.go b/difflib/difflib_test.go
@@ -248,18 +248,18 @@ func TestWithAsciiBJunk(t *testing.T) {
 	}
 	sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
-	assertEqual(t, sm.bJunk, map[string]struct{}{})
+	assertEqual(t, sm.bJunk, map[string]bool{})
 
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}})
+	assertEqual(t, sm.bJunk, map[string]bool{" ": true})
 
 	isJunk = func(s string) bool {
 		return s == " " || s == "b"
 	}
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}, "b": struct{}{}})
+	assertEqual(t, sm.bJunk, map[string]bool{" ": true, "b": true})
 }
 
 func TestSFBugsRatioForNullSeqn(t *testing.T) {

Original file line number	Diff line number	Diff line change
`@@ -248,18 +248,18 @@ func TestWithAsciiBJunk(t *testing.T) {`
`248`	`248`	`}`
`249`	`249`	`sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`250`	`250`	`splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)`
`251`		`- assertEqual(t, sm.bJunk, map[string]struct{}{})`
	`251`	`+ assertEqual(t, sm.bJunk, map[string]bool{})`
`252`	`252`
`253`	`253`	`sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`254`	`254`	`splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)`
`255`		`- assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}})`
	`255`	`+ assertEqual(t, sm.bJunk, map[string]bool{" ": true})`
`256`	`256`
`257`	`257`	`isJunk = func(s string) bool {`
`258`	`258`	`return s == " " \|\| s == "b"`
`259`	`259`	`}`
`260`	`260`	`sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`261`	`261`	`splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)`
`262`		`- assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}, "b": struct{}{}})`
	`262`	`+ assertEqual(t, sm.bJunk, map[string]bool{" ": true, "b": true})`
`263`	`263`	`}`
`264`	`264`
`265`	`265`	`func TestSFBugsRatioForNullSeqn(t *testing.T) {`