Skip to content

Commit

Permalink
went back and read the code, many little tidy-ups
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Bray <tbray@textuality.com>
  • Loading branch information
timbray committed Apr 14, 2024
1 parent 2f265c6 commit 147a00e
Show file tree
Hide file tree
Showing 15 changed files with 233 additions and 219 deletions.
46 changes: 23 additions & 23 deletions internal/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@ import (
"strings"
)

type Config struct {
Size int
Fields []uint
type config struct {
size int
fields []uint
Fname string
Filter Filters
Width int
Sample bool
filter filters
width int
sample bool
}

func Configure(args []string) (*Config, error) {
func Configure(args []string) (*config, error) {
// lifted out of main.go to facilitate testing
config := Config{Size: 10}
config := config{size: 10}
var err error

i := 0
Expand All @@ -31,41 +31,41 @@ func Configure(args []string) (*Config, error) {
err = errors.New("insufficient arguments for --number")
} else {
i++
config.Size, err = strconv.Atoi(args[i])
if err == nil && config.Size < 1 {
err = fmt.Errorf("invalid Size %d", config.Size)
config.size, err = strconv.Atoi(args[i])
if err == nil && config.size < 1 {
err = fmt.Errorf("invalid size %d", config.size)
}
}
case arg == "-f" || arg == "--fields":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --fields")
} else {
i++
config.Fields, err = parseFields(args[i])
config.fields, err = parseFields(args[i])
}
case arg == "-g" || arg == "--grep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --grep")
} else {
i++
err = config.Filter.AddGrep(args[i])
err = config.filter.addGrep(args[i])
}
case arg == "-v" || arg == "--vgrep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --vgrep")
} else {
i++
err = config.Filter.AddVgrep(args[i])
err = config.filter.addVgrep(args[i])
}
case arg == "-s" || arg == "--sed":
if (i + 2) >= len(args) {
err = errors.New("insufficient arguments for --sed")
} else {
err = config.Filter.AddSed(args[i+1], args[i+2])
err = config.filter.addSed(args[i+1], args[i+2])
i += 2
}
case arg == "--sample":
config.Sample = true
config.sample = true
case arg == "-h" || arg == "-help" || arg == "--help":
fmt.Println(instructions)
os.Exit(0)
Expand All @@ -74,9 +74,9 @@ func Configure(args []string) (*Config, error) {
err = errors.New("insufficient arguments for --width")
} else {
i++
config.Width, err = strconv.Atoi(args[i])
if err == nil && config.Width < 1 {
err = fmt.Errorf("invalid Width %d", config.Width)
config.width, err = strconv.Atoi(args[i])
if err == nil && config.width < 1 {
err = fmt.Errorf("invalid width %d", config.width)
}
}

Expand Down Expand Up @@ -134,15 +134,15 @@ Usage: tf
All the arguments are optional; if none are provided, tf will read records
from the standard input and list the 10 which occur most often.
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The Fields
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields
must be provided in order, so 3,1,7 is an error.
The regexp-valued Fields work as follows:
The regexp-valued fields work as follows:
-g/--grep discards records that don't match the regexp (g for grep)
-v/--vgrep discards records that do match the regexp (v for grep -v)
-s/--sed works on extracted Fields, replacing regexp with replacement
-s/--sed works on extracted fields, replacing regexp with replacement
The regexp-valued Fields can be supplied multiple times; the filtering
The regexp-valued fields can be supplied multiple times; the filtering
and substitution will be performed in the order supplied.
If the input is a named file, tf will process it in multiple parallel
Expand Down
2 changes: 1 addition & 1 deletion internal/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func TestArgSyntax(t *testing.T) {

for _, good := range goods {
var err error
var c *Config
var c *config
c, err = Configure(good)
if err != nil || c == nil {
t.Error("rejected good argument: " + good[0])
Expand Down
64 changes: 39 additions & 25 deletions internal/counter.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,54 @@ import (
"sort"
)

// KeyCount represents a key's occurrence count.
type KeyCount struct {
// keyCount represents a key's occurrence count.
type keyCount struct {
Key string
Count *uint64
}

// Counter represents a bunch of keys and their occurrence counts, with the highest counts tracked.
// The core idea is that when you read a large number of field values and want to find the N values which
// occur most commonly, you keep a large table of the occurrence counts for each observed value and a small
// table of the top values/counts, and remember the occurrence threshold it takes to get into the top table.
// Then for each value, you increment its count and see if the new count gets it into the current top-values
// list, you add it if it's not already there. The top-values table will grow, so every so often you trim it
// back to size N. After a while, in a large dataset the overwhelming majority of values will either already
// be in the top-values table or not belong there, so that table's membership will be increasingly stable
// and require neither growing nor trimming. When you reach the end of the data, you sort the top-values
// table (trivial, because it's small) and return that. I haven't done a formal analysis but I'm pretty sure
// the computation trends to O(N) in the size of the number of records. Also it's "embarrassingly parallel"
// in the sense that *if* you can access the records in parallel you can do the top-values computation in as
// many parallel threads as the underlying computer can offer.

// counter represents a bunch of keys and their occurrence counts, with the highest counts tracked.
// threshold represents the minimum count value to qualify for consideration as a top count
// the "top" map represents the keys & counts encountered so far which are higher than threshold
// The hash values are pointers not integers for efficiency reasons, so you don't have to update the
// map[string] mapping, you just update the number the key maps to.
type Counter struct {
// map[string] mapping, you just update the number the key maps to.
type counter struct {
counts map[string]*uint64
top map[string]*uint64
threshold uint64
size int
}

// NewCounter creates a new empty counter, ready for use. Size controls how many top items to track.
func NewCounter(size int) *Counter {
t := new(Counter)
// newCounter creates a new empty counter, ready for use. size controls how many top items to track.
func newCounter(size int) *counter {
t := new(counter)
t.size = size
t.counts = make(map[string]*uint64, 1024)
t.top = make(map[string]*uint64, size*2)
return t
}

// Add one occurrence to the counts for the indicated key.
func (t *Counter) Add(bytes []byte) {
// add one occurrence to the counts for the indicated key.
func (t *counter) add(bytes []byte) {
// note the call with a byte slice rather than the string because of
// https://github.com/golang/go/commit/f5f5a8b6209f84961687d993b93ea0d397f5d5bf
// which recognizes the idiom foo[string(someByteSlice)] and bypasses constructing the string;
// of course we'd rather just say foo[someByteSlice] but that's not legal because Reasons.

// have we seen this key?
// have we seen this key?
count, ok := t.counts[string(bytes)]
if !ok {
var one uint64 = 1
Expand All @@ -61,8 +74,8 @@ func (t *Counter) Add(bytes []byte) {
}
}

func (t *Counter) compact() {
// sort the top candidates, shrink the list to the top t.Size, put them back in a map
func (t *counter) compact() {
// sort the top candidates, shrink the list to the top t.size, put them back in a map
var topList = t.topAsSortedList()
topList = topList[0:t.size]
t.threshold = *(topList[len(topList)-1].Count)
Expand All @@ -72,31 +85,31 @@ func (t *Counter) compact() {
}
}

func (t *Counter) topAsSortedList() []*KeyCount {
topList := make([]*KeyCount, 0, len(t.top))
func (t *counter) topAsSortedList() []*keyCount {
topList := make([]*keyCount, 0, len(t.top))
for key, count := range t.top {
topList = append(topList, &KeyCount{key, count})
topList = append(topList, &keyCount{key, count})
}
sort.Slice(topList, func(k1, k2 int) bool {
return *topList[k1].Count > *topList[k2].Count
})
return topList
}

// GetTop returns the top occurring keys & counts in order of descending count
func (t *Counter) GetTop() []*KeyCount {
// getTop returns the top occurring keys & counts in order of descending count
func (t *counter) getTop() []*keyCount {
topList := t.topAsSortedList()
if len(topList) > t.size {
return topList[0:t.size]
}
return topList
}

// merge applies the counts from the SegmentCounter into the Counter.
// merge applies the counts from the segmentCounter into the counter.
// Once merged, the segmentCounter should be discarded.
func (t *Counter) merge(segCounter segmentCounter) {
func (t *counter) merge(segCounter segmentCounter) {
for segKey, segCount := range segCounter {
// Annoyingly we can't efficiently call Add here because we have
// Annoyingly we can't efficiently call add here because we have
// a string not a []byte
count, existingKey := t.counts[segKey]
if !existingKey {
Expand All @@ -110,10 +123,11 @@ func (t *Counter) merge(segCounter segmentCounter) {
if *count >= t.threshold {
// if it wasn't in t.counts then we already know it's not in
// t.top
var topKey bool
if existingKey {
_, existingKey = t.top[segKey]
_, topKey = t.top[segKey]
}
if !existingKey {
if !topKey {
t.top[segKey] = count
// has the top set grown enough to compress?
if len(t.top) >= (t.size * 2) {
Expand All @@ -124,14 +138,14 @@ func (t *Counter) merge(segCounter segmentCounter) {
}
}

// SegmentCounter tracks key occurrence counts for a single segment.
// segmentCounter tracks key occurrence counts for a single segment.
type segmentCounter map[string]*uint64

func newSegmentCounter() segmentCounter {
return make(segmentCounter, 1024)
}

func (s segmentCounter) Add(key []byte) {
func (s segmentCounter) add(key []byte) {
count, ok := s[string(key)]
if !ok {
var one uint64 = 1
Expand Down
48 changes: 24 additions & 24 deletions internal/counter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ func Test1KLines(t *testing.T) {
}
//noinspection ALL
defer file.Close()
table := NewCounter(5)
table := newCounter(5)
re := regexp.MustCompile(`\s+`)

scanner := bufio.NewScanner(file)
for scanner.Scan() {
fields := re.Split(scanner.Text(), 2)
table.Add([]byte(fields[0]))
table.add([]byte(fields[0]))
}

if err := scanner.Err(); err != nil {
t.Error(err.Error())
}
x := table.GetTop()
x := table.getTop()

var wanted = map[string]int{
"96.48.229.116": 74,
Expand All @@ -42,13 +42,13 @@ func Test1KLines(t *testing.T) {

for _, kc := range x {
if *kc.Count != uint64(wanted[kc.Key]) {
t.Error("Wrong count for key: " + kc.Key)
t.Error("Wrong count for Key: " + kc.Key)
}
}
}

func TestTable_Add(t *testing.T) {
table := NewCounter(5)
table := newCounter(5)
keys := []string{
"a", "b", "c", "d", "e", "f", "g", "h",
"a", "b", "c", "d", "e", "f", "g",
Expand All @@ -59,74 +59,74 @@ func TestTable_Add(t *testing.T) {
"c", "g",
"c"}
for _, key := range keys {
table.Add([]byte(key))
table.add([]byte(key))
}
n4 := uint64(4)
n5 := uint64(5)
n6 := uint64(6)
n7 := uint64(7)
n8 := uint64(8)

wanted := []*KeyCount{
wanted := []*keyCount{
{"c", &n8},
{"g", &n7},
{"e", &n6},
{"f", &n5},
{"a", &n4},
}
assertKeyCountsEqual(t, wanted, table.GetTop())
assertKeyCountsEqual(t, wanted, table.getTop())

table = NewCounter(3)
table = newCounter(3)
for _, key := range keys {
table.Add([]byte(key))
table.add([]byte(key))
}
wanted = []*KeyCount{
wanted = []*keyCount{
{"c", &n8},
{"g", &n7},
{"e", &n6},
}
assertKeyCountsEqual(t, wanted, table.GetTop())
assertKeyCountsEqual(t, wanted, table.getTop())
}

func Test_newTable(t *testing.T) {
table := NewCounter(333)
top := table.GetTop()
table := newCounter(333)
top := table.getTop()
if len(top) != 0 {
t.Error("new table should be empty")
}
}

func Test_Merge(t *testing.T) {
a := NewCounter(10)
a := newCounter(10)
b := newSegmentCounter()
c := newSegmentCounter()
for i := 0; i < 50; i++ {
b.Add([]byte{byte('A')})
b.Add([]byte{byte('B')})
c.Add([]byte{byte('C')})
c.Add([]byte{byte('A')})
b.add([]byte{byte('A')})
b.add([]byte{byte('B')})
c.add([]byte{byte('C')})
c.add([]byte{byte('A')})
}
c.Add([]byte{byte('C')})
c.add([]byte{byte('C')})
a.merge(b)
a.merge(c)
exp := []*KeyCount{
exp := []*keyCount{
{"A", pv(100)}, {"C", pv(51)}, {"B", pv(50)},
}
assertKeyCountsEqual(t, exp, a.GetTop())
assertKeyCountsEqual(t, exp, a.getTop())
}

func pv(v uint64) *uint64 {
return &v
}

func assertKeyCountsEqual(t *testing.T, exp []*KeyCount, act []*KeyCount) {
func assertKeyCountsEqual(t *testing.T, exp []*keyCount, act []*keyCount) {
t.Helper()
if len(exp) != len(act) {
t.Errorf("Expecting %d results, but got %d", len(exp), len(act))
}
for i := 0; i < min(len(exp), len(act)); i++ {
if exp[i].Key != act[i].Key {
t.Errorf("Unexpected key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key)
t.Errorf("Unexpected Key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key)
}
if *exp[i].Count != *act[i].Count {
t.Errorf("Unexpected count of %d at index %d, expecting %d", *act[i].Count, i, *exp[i].Count)
Expand Down
Loading

0 comments on commit 147a00e

Please sign in to comment.