Skip to content

Commit

Permalink
went back and read the code, many little tidy-ups
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Bray <tbray@textuality.com>
  • Loading branch information
timbray committed Apr 14, 2024
1 parent 2f265c6 commit 147a00e
Show file tree
Hide file tree
Showing 15 changed files with 233 additions and 219 deletions.
46 changes: 23 additions & 23 deletions internal/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@ import (
"strings"
)

type Config struct {
Size int
Fields []uint
type config struct {
size int
fields []uint
Fname string
Filter Filters
Width int
Sample bool
filter filters
width int
sample bool
}

func Configure(args []string) (*Config, error) {
func Configure(args []string) (*config, error) {
// lifted out of main.go to facilitate testing
config := Config{Size: 10}
config := config{size: 10}
var err error

i := 0
Expand All @@ -31,41 +31,41 @@ func Configure(args []string) (*Config, error) {
err = errors.New("insufficient arguments for --number")
} else {
i++
config.Size, err = strconv.Atoi(args[i])
if err == nil && config.Size < 1 {
err = fmt.Errorf("invalid Size %d", config.Size)
config.size, err = strconv.Atoi(args[i])
if err == nil && config.size < 1 {
err = fmt.Errorf("invalid size %d", config.size)
}
}
case arg == "-f" || arg == "--fields":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --fields")
} else {
i++
config.Fields, err = parseFields(args[i])
config.fields, err = parseFields(args[i])
}
case arg == "-g" || arg == "--grep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --grep")
} else {
i++
err = config.Filter.AddGrep(args[i])
err = config.filter.addGrep(args[i])
}
case arg == "-v" || arg == "--vgrep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --vgrep")
} else {
i++
err = config.Filter.AddVgrep(args[i])
err = config.filter.addVgrep(args[i])
}
case arg == "-s" || arg == "--sed":
if (i + 2) >= len(args) {
err = errors.New("insufficient arguments for --sed")
} else {
err = config.Filter.AddSed(args[i+1], args[i+2])
err = config.filter.addSed(args[i+1], args[i+2])
i += 2
}
case arg == "--sample":
config.Sample = true
config.sample = true
case arg == "-h" || arg == "-help" || arg == "--help":
fmt.Println(instructions)
os.Exit(0)
Expand All @@ -74,9 +74,9 @@ func Configure(args []string) (*Config, error) {
err = errors.New("insufficient arguments for --width")
} else {
i++
config.Width, err = strconv.Atoi(args[i])
if err == nil && config.Width < 1 {
err = fmt.Errorf("invalid Width %d", config.Width)
config.width, err = strconv.Atoi(args[i])
if err == nil && config.width < 1 {
err = fmt.Errorf("invalid width %d", config.width)
}
}

Expand Down Expand Up @@ -134,15 +134,15 @@ Usage: tf
All the arguments are optional; if none are provided, tf will read records
from the standard input and list the 10 which occur most often.
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The Fields
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields
must be provided in order, so 3,1,7 is an error.
The regexp-valued Fields work as follows:
The regexp-valued fields work as follows:
-g/--grep discards records that don't match the regexp (g for grep)
-v/--vgrep discards records that do match the regexp (v for grep -v)
-s/--sed works on extracted Fields, replacing regexp with replacement
-s/--sed works on extracted fields, replacing regexp with replacement
The regexp-valued Fields can be supplied multiple times; the filtering
The regexp-valued fields can be supplied multiple times; the filtering
and substitution will be performed in the order supplied.
If the input is a named file, tf will process it in multiple parallel
Expand Down
2 changes: 1 addition & 1 deletion internal/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func TestArgSyntax(t *testing.T) {

for _, good := range goods {
var err error
var c *Config
var c *config
c, err = Configure(good)
if err != nil || c == nil {
t.Error("rejected good argument: " + good[0])
Expand Down
64 changes: 39 additions & 25 deletions internal/counter.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,54 @@ import (
"sort"
)

// KeyCount represents a key's occurrence count.
type KeyCount struct {
// keyCount represents a key's occurrence count.
type keyCount struct {
Key string
Count *uint64
}

// Counter represents a bunch of keys and their occurrence counts, with the highest counts tracked.
// The core idea is that when you read a large number of field values and want to find the N values which
// occur most commonly, you keep a large table of the occurrence counts for each observed value and a small
// table of the top values/counts, and remember the occurrence threshold it takes to get into the top table.
// Then for each value, you increment its count and see if the new count gets it into the current top-values
// list, you add it if it's not already there. The top-values table will grow, so every so often you trim it
// back to size N. After a while, in a large dataset the overwhelming majority of values will either already
// be in the top-values table or not belong there, so that table's membership will be increasingly stable
// and require neither growing nor trimming. When you reach the end of the data, you sort the top-values
// table (trivial, because it's small) and return that. I haven't done a formal analysis but I'm pretty sure
// the computation trends to O(N) in the size of the number of records. Also it's "embarrassingly parallel"
// in the sense that *if* you can access the records in parallel you can do the top-values computation in as
// many parallel threads as the underlying computer can offer.

// counter represents a bunch of keys and their occurrence counts, with the highest counts tracked.
// threshold represents the minimum count value to qualify for consideration as a top count
// the "top" map represents the keys & counts encountered so far which are higher than threshold
// The hash values are pointers not integers for efficiency reasons, so you don't have to update the
// map[string] mapping, you just update the number the key maps to.
type Counter struct {
// map[string] mapping, you just update the number the key maps to.
type counter struct {
counts map[string]*uint64
top map[string]*uint64
threshold uint64
size int
}

// NewCounter creates a new empty counter, ready for use. Size controls how many top items to track.
func NewCounter(size int) *Counter {
t := new(Counter)
// newCounter creates a new empty counter, ready for use. size controls how many top items to track.
func newCounter(size int) *counter {
t := new(counter)
t.size = size
t.counts = make(map[string]*uint64, 1024)
t.top = make(map[string]*uint64, size*2)
return t
}

// Add one occurrence to the counts for the indicated key.
func (t *Counter) Add(bytes []byte) {
// add one occurrence to the counts for the indicated key.
func (t *counter) add(bytes []byte) {
// note the call with a byte slice rather than the string because of
// https://github.com/golang/go/commit/f5f5a8b6209f84961687d993b93ea0d397f5d5bf
// which recognizes the idiom foo[string(someByteSlice)] and bypasses constructing the string;
// of course we'd rather just say foo[someByteSlice] but that's not legal because Reasons.

// have we seen this key?
// have we seen this key?
count, ok := t.counts[string(bytes)]
if !ok {
var one uint64 = 1
Expand All @@ -61,8 +74,8 @@ func (t *Counter) Add(bytes []byte) {
}
}

func (t *Counter) compact() {
// sort the top candidates, shrink the list to the top t.Size, put them back in a map
func (t *counter) compact() {
// sort the top candidates, shrink the list to the top t.size, put them back in a map
var topList = t.topAsSortedList()
topList = topList[0:t.size]
t.threshold = *(topList[len(topList)-1].Count)
Expand All @@ -72,31 +85,31 @@ func (t *Counter) compact() {
}
}

func (t *Counter) topAsSortedList() []*KeyCount {
topList := make([]*KeyCount, 0, len(t.top))
func (t *counter) topAsSortedList() []*keyCount {
topList := make([]*keyCount, 0, len(t.top))
for key, count := range t.top {
topList = append(topList, &KeyCount{key, count})
topList = append(topList, &keyCount{key, count})
}
sort.Slice(topList, func(k1, k2 int) bool {
return *topList[k1].Count > *topList[k2].Count
})
return topList
}

// GetTop returns the top occurring keys & counts in order of descending count
func (t *Counter) GetTop() []*KeyCount {
// getTop returns the top occurring keys & counts in order of descending count
func (t *counter) getTop() []*keyCount {
topList := t.topAsSortedList()
if len(topList) > t.size {
return topList[0:t.size]
}
return topList
}

// merge applies the counts from the SegmentCounter into the Counter.
// merge applies the counts from the segmentCounter into the counter.
// Once merged, the segmentCounter should be discarded.
func (t *Counter) merge(segCounter segmentCounter) {
func (t *counter) merge(segCounter segmentCounter) {
for segKey, segCount := range segCounter {
// Annoyingly we can't efficiently call Add here because we have
// Annoyingly we can't efficiently call add here because we have
// a string not a []byte
count, existingKey := t.counts[segKey]
if !existingKey {
Expand All @@ -110,10 +123,11 @@ func (t *Counter) merge(segCounter segmentCounter) {
if *count >= t.threshold {
// if it wasn't in t.counts then we already know it's not in
// t.top
var topKey bool
if existingKey {
_, existingKey = t.top[segKey]
_, topKey = t.top[segKey]
}
if !existingKey {
if !topKey {
t.top[segKey] = count
// has the top set grown enough to compress?
if len(t.top) >= (t.size * 2) {
Expand All @@ -124,14 +138,14 @@ func (t *Counter) merge(segCounter segmentCounter) {
}
}

// SegmentCounter tracks key occurrence counts for a single segment.
// segmentCounter tracks key occurrence counts for a single segment.
type segmentCounter map[string]*uint64

func newSegmentCounter() segmentCounter {
return make(segmentCounter, 1024)
}

func (s segmentCounter) Add(key []byte) {
func (s segmentCounter) add(key []byte) {
count, ok := s[string(key)]
if !ok {
var one uint64 = 1
Expand Down
48 changes: 24 additions & 24 deletions internal/counter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ func Test1KLines(t *testing.T) {
}
//noinspection ALL
defer file.Close()
table := NewCounter(5)
table := newCounter(5)
re := regexp.MustCompile(`\s+`)

scanner := bufio.NewScanner(file)
for scanner.Scan() {
fields := re.Split(scanner.Text(), 2)
table.Add([]byte(fields[0]))
table.add([]byte(fields[0]))
}

if err := scanner.Err(); err != nil {
t.Error(err.Error())
}
x := table.GetTop()
x := table.getTop()

var wanted = map[string]int{
"96.48.229.116": 74,
Expand All @@ -42,13 +42,13 @@ func Test1KLines(t *testing.T) {

for _, kc := range x {
if *kc.Count != uint64(wanted[kc.Key]) {
t.Error("Wrong count for key: " + kc.Key)
t.Error("Wrong count for Key: " + kc.Key)
}
}
}

func TestTable_Add(t *testing.T) {
table := NewCounter(5)
table := newCounter(5)
keys := []string{
"a", "b", "c", "d", "e", "f", "g", "h",
"a", "b", "c", "d", "e", "f", "g",
Expand All @@ -59,74 +59,74 @@ func TestTable_Add(t *testing.T) {
"c", "g",
"c"}
for _, key := range keys {
table.Add([]byte(key))
table.add([]byte(key))
}
n4 := uint64(4)
n5 := uint64(5)
n6 := uint64(6)
n7 := uint64(7)
n8 := uint64(8)

wanted := []*KeyCount{
wanted := []*keyCount{
{"c", &n8},
{"g", &n7},
{"e", &n6},
{"f", &n5},
{"a", &n4},
}
assertKeyCountsEqual(t, wanted, table.GetTop())
assertKeyCountsEqual(t, wanted, table.getTop())

table = NewCounter(3)
table = newCounter(3)
for _, key := range keys {
table.Add([]byte(key))
table.add([]byte(key))
}
wanted = []*KeyCount{
wanted = []*keyCount{
{"c", &n8},
{"g", &n7},
{"e", &n6},
}
assertKeyCountsEqual(t, wanted, table.GetTop())
assertKeyCountsEqual(t, wanted, table.getTop())
}

func Test_newTable(t *testing.T) {
table := NewCounter(333)
top := table.GetTop()
table := newCounter(333)
top := table.getTop()
if len(top) != 0 {
t.Error("new table should be empty")
}
}

func Test_Merge(t *testing.T) {
a := NewCounter(10)
a := newCounter(10)
b := newSegmentCounter()
c := newSegmentCounter()
for i := 0; i < 50; i++ {
b.Add([]byte{byte('A')})
b.Add([]byte{byte('B')})
c.Add([]byte{byte('C')})
c.Add([]byte{byte('A')})
b.add([]byte{byte('A')})
b.add([]byte{byte('B')})
c.add([]byte{byte('C')})
c.add([]byte{byte('A')})
}
c.Add([]byte{byte('C')})
c.add([]byte{byte('C')})
a.merge(b)
a.merge(c)
exp := []*KeyCount{
exp := []*keyCount{
{"A", pv(100)}, {"C", pv(51)}, {"B", pv(50)},
}
assertKeyCountsEqual(t, exp, a.GetTop())
assertKeyCountsEqual(t, exp, a.getTop())
}

func pv(v uint64) *uint64 {
return &v
}

func assertKeyCountsEqual(t *testing.T, exp []*KeyCount, act []*KeyCount) {
func assertKeyCountsEqual(t *testing.T, exp []*keyCount, act []*keyCount) {
t.Helper()
if len(exp) != len(act) {
t.Errorf("Expecting %d results, but got %d", len(exp), len(act))
}
for i := 0; i < min(len(exp), len(act)); i++ {
if exp[i].Key != act[i].Key {
t.Errorf("Unexpected key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key)
t.Errorf("Unexpected Key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key)
}
if *exp[i].Count != *act[i].Count {
t.Errorf("Unexpected count of %d at index %d, expecting %d", *act[i].Count, i, *exp[i].Count)
Expand Down
Loading

0 comments on commit 147a00e

Please sign in to comment.