diff --git a/internal/config.go b/internal/config.go index 7e144c4..31fb49d 100644 --- a/internal/config.go +++ b/internal/config.go @@ -8,18 +8,18 @@ import ( "strings" ) -type Config struct { - Size int - Fields []uint +type config struct { + size int + fields []uint Fname string - Filter Filters - Width int - Sample bool + filter filters + width int + sample bool } -func Configure(args []string) (*Config, error) { +func Configure(args []string) (*config, error) { // lifted out of main.go to facilitate testing - config := Config{Size: 10} + config := config{size: 10} var err error i := 0 @@ -31,9 +31,9 @@ func Configure(args []string) (*Config, error) { err = errors.New("insufficient arguments for --number") } else { i++ - config.Size, err = strconv.Atoi(args[i]) - if err == nil && config.Size < 1 { - err = fmt.Errorf("invalid Size %d", config.Size) + config.size, err = strconv.Atoi(args[i]) + if err == nil && config.size < 1 { + err = fmt.Errorf("invalid size %d", config.size) } } case arg == "-f" || arg == "--fields": @@ -41,31 +41,31 @@ func Configure(args []string) (*Config, error) { err = errors.New("insufficient arguments for --fields") } else { i++ - config.Fields, err = parseFields(args[i]) + config.fields, err = parseFields(args[i]) } case arg == "-g" || arg == "--grep": if (i + 1) >= len(args) { err = errors.New("insufficient arguments for --grep") } else { i++ - err = config.Filter.AddGrep(args[i]) + err = config.filter.addGrep(args[i]) } case arg == "-v" || arg == "--vgrep": if (i + 1) >= len(args) { err = errors.New("insufficient arguments for --vgrep") } else { i++ - err = config.Filter.AddVgrep(args[i]) + err = config.filter.addVgrep(args[i]) } case arg == "-s" || arg == "--sed": if (i + 2) >= len(args) { err = errors.New("insufficient arguments for --sed") } else { - err = config.Filter.AddSed(args[i+1], args[i+2]) + err = config.filter.addSed(args[i+1], args[i+2]) i += 2 } case arg == "--sample": - config.Sample = true + config.sample = true case 
arg == "-h" || arg == "-help" || arg == "--help": fmt.Println(instructions) os.Exit(0) @@ -74,9 +74,9 @@ func Configure(args []string) (*Config, error) { err = errors.New("insufficient arguments for --width") } else { i++ - config.Width, err = strconv.Atoi(args[i]) - if err == nil && config.Width < 1 { - err = fmt.Errorf("invalid Width %d", config.Width) + config.width, err = strconv.Atoi(args[i]) + if err == nil && config.width < 1 { + err = fmt.Errorf("invalid width %d", config.width) } } @@ -134,15 +134,15 @@ Usage: tf All the arguments are optional; if none are provided, tf will read records from the standard input and list the 10 which occur most often. -Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The Fields +Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields must be provided in order, so 3,1,7 is an error. -The regexp-valued Fields work as follows: +The regexp-valued fields work as follows: -g/--grep discards records that don't match the regexp (g for grep) -v/--vgrep discards records that do match the regexp (v for grep -v) --s/--sed works on extracted Fields, replacing regexp with replacement +-s/--sed works on extracted fields, replacing regexp with replacement -The regexp-valued Fields can be supplied multiple times; the filtering +The regexp-valued fields can be supplied multiple times; the filtering and substitution will be performed in the order supplied. 
If the input is a named file, tf will process it in multiple parallel diff --git a/internal/config_test.go b/internal/config_test.go index 16c44db..0eaeabc 100644 --- a/internal/config_test.go +++ b/internal/config_test.go @@ -38,7 +38,7 @@ func TestArgSyntax(t *testing.T) { for _, good := range goods { var err error - var c *Config + var c *config c, err = Configure(good) if err != nil || c == nil { t.Error("rejected good argument: " + good[0]) diff --git a/internal/counter.go b/internal/counter.go index 25243b7..3ca5a6c 100644 --- a/internal/counter.go +++ b/internal/counter.go @@ -4,41 +4,54 @@ import ( "sort" ) -// KeyCount represents a key's occurrence count. -type KeyCount struct { +// keyCount represents a Key's occurrence count. +type keyCount struct { Key string Count *uint64 } -// Counter represents a bunch of keys and their occurrence counts, with the highest counts tracked. +// The core idea is that when you read a large number of field values and want to find the N values which +// occur most commonly, you keep a large table of the occurrence counts for each observed value and a small +// table of the top values/counts, and remember the occurrence threshold it takes to get into the top table. +// Then for each value, you increment its count and see if the new count gets it into the current top-values +// list, you add it if it's not already there. The top-values table will grow, so every so often you trim it +// back to size N. After a while, in a large dataset the overwhelming majority of values will either already +// be in the top-values table or not belong there, so that table's membership will be increasingly stable +// and require neither growing nor trimming. When you reach the end of the data, you sort the top-values +// table (trivial, because it's small) and return that. I haven't done a formal analysis but I'm pretty sure +// the computation trends to O(N) in the size of the number of records. 
Also it's "embarrassingly parallel" +// in the sense that *if* you can access the records in parallel you can do the top-values computation in as +// many parallel threads as the underlying computer can offer. + +// counter represents a bunch of keys and their occurrence counts, with the highest counts tracked. // threshold represents the minimum count value to qualify for consideration as a top count // the "top" map represents the keys & counts encountered so far which are higher than threshold // The hash values are pointers not integers for efficiency reasons, so you don't have to update the -// map[string] mapping, you just update the number the key maps to. -type Counter struct { +// map[string] mapping, you just update the number the Key maps to. +type counter struct { counts map[string]*uint64 top map[string]*uint64 threshold uint64 size int } -// NewCounter creates a new empty counter, ready for use. Size controls how many top items to track. -func NewCounter(size int) *Counter { - t := new(Counter) +// newCounter creates a new empty counter, ready for use. size controls how many top items to track. +func newCounter(size int) *counter { + t := new(counter) t.size = size t.counts = make(map[string]*uint64, 1024) t.top = make(map[string]*uint64, size*2) return t } -// Add one occurrence to the counts for the indicated key. -func (t *Counter) Add(bytes []byte) { +// add one occurrence to the counts for the indicated Key. +func (t *counter) add(bytes []byte) { // note the call with a byte slice rather than the string because of // https://github.com/golang/go/commit/f5f5a8b6209f84961687d993b93ea0d397f5d5bf // which recognizes the idiom foo[string(someByteSlice)] and bypasses constructing the string; // of course we'd rather just say foo[someByteSlice] but that's not legal because Reasons. - // have we seen this key? + // have we seen this Key? 
count, ok := t.counts[string(bytes)] if !ok { var one uint64 = 1 @@ -61,8 +74,8 @@ func (t *Counter) Add(bytes []byte) { } } -func (t *Counter) compact() { - // sort the top candidates, shrink the list to the top t.Size, put them back in a map +func (t *counter) compact() { + // sort the top candidates, shrink the list to the top t.size, put them back in a map var topList = t.topAsSortedList() topList = topList[0:t.size] t.threshold = *(topList[len(topList)-1].Count) @@ -72,10 +85,10 @@ func (t *Counter) compact() { } } -func (t *Counter) topAsSortedList() []*KeyCount { - topList := make([]*KeyCount, 0, len(t.top)) +func (t *counter) topAsSortedList() []*keyCount { + topList := make([]*keyCount, 0, len(t.top)) for key, count := range t.top { - topList = append(topList, &KeyCount{key, count}) + topList = append(topList, &keyCount{key, count}) } sort.Slice(topList, func(k1, k2 int) bool { return *topList[k1].Count > *topList[k2].Count @@ -83,8 +96,8 @@ func (t *Counter) topAsSortedList() []*KeyCount { return topList } -// GetTop returns the top occurring keys & counts in order of descending count -func (t *Counter) GetTop() []*KeyCount { +// getTop returns the top occurring keys & counts in order of descending count +func (t *counter) getTop() []*keyCount { topList := t.topAsSortedList() if len(topList) > t.size { return topList[0:t.size] @@ -92,11 +105,11 @@ func (t *Counter) GetTop() []*KeyCount { return topList } -// merge applies the counts from the SegmentCounter into the Counter. +// merge applies the counts from the SegmentCounter into the counter. // Once merged, the SegmentCounter should be discarded. 
-func (t *Counter) merge(segCounter segmentCounter) { +func (t *counter) merge(segCounter segmentCounter) { for segKey, segCount := range segCounter { - // Annoyingly we can't efficiently call Add here because we have + // Annoyingly we can't efficiently call add here because we have // a string not a []byte count, existingKey := t.counts[segKey] if !existingKey { @@ -110,10 +123,11 @@ func (t *Counter) merge(segCounter segmentCounter) { if *count >= t.threshold { // if it wasn't in t.counts then we already know it's not in // t.top + var topKey bool if existingKey { - _, existingKey = t.top[segKey] + _, topKey = t.top[segKey] } - if !existingKey { + if !topKey { t.top[segKey] = count // has the top set grown enough to compress? if len(t.top) >= (t.size * 2) { @@ -124,14 +138,14 @@ func (t *Counter) merge(segCounter segmentCounter) { } } -// SegmentCounter tracks key occurrence counts for a single segment. +// SegmentCounter tracks Key occurrence counts for a single segment. type segmentCounter map[string]*uint64 func newSegmentCounter() segmentCounter { return make(segmentCounter, 1024) } -func (s segmentCounter) Add(key []byte) { +func (s segmentCounter) add(key []byte) { count, ok := s[string(key)] if !ok { var one uint64 = 1 diff --git a/internal/counter_test.go b/internal/counter_test.go index 6f784a5..217d79b 100644 --- a/internal/counter_test.go +++ b/internal/counter_test.go @@ -14,19 +14,19 @@ func Test1KLines(t *testing.T) { } //noinspection ALL defer file.Close() - table := NewCounter(5) + table := newCounter(5) re := regexp.MustCompile(`\s+`) scanner := bufio.NewScanner(file) for scanner.Scan() { fields := re.Split(scanner.Text(), 2) - table.Add([]byte(fields[0])) + table.add([]byte(fields[0])) } if err := scanner.Err(); err != nil { t.Error(err.Error()) } - x := table.GetTop() + x := table.getTop() var wanted = map[string]int{ "96.48.229.116": 74, @@ -42,13 +42,13 @@ func Test1KLines(t *testing.T) { for _, kc := range x { if *kc.Count != 
uint64(wanted[kc.Key]) { - t.Error("Wrong count for key: " + kc.Key) + t.Error("Wrong count for Key: " + kc.Key) } } } func TestTable_Add(t *testing.T) { - table := NewCounter(5) + table := newCounter(5) keys := []string{ "a", "b", "c", "d", "e", "f", "g", "h", "a", "b", "c", "d", "e", "f", "g", @@ -59,7 +59,7 @@ func TestTable_Add(t *testing.T) { "c", "g", "c"} for _, key := range keys { - table.Add([]byte(key)) + table.add([]byte(key)) } n4 := uint64(4) n5 := uint64(5) @@ -67,66 +67,66 @@ func TestTable_Add(t *testing.T) { n7 := uint64(7) n8 := uint64(8) - wanted := []*KeyCount{ + wanted := []*keyCount{ {"c", &n8}, {"g", &n7}, {"e", &n6}, {"f", &n5}, {"a", &n4}, } - assertKeyCountsEqual(t, wanted, table.GetTop()) + assertKeyCountsEqual(t, wanted, table.getTop()) - table = NewCounter(3) + table = newCounter(3) for _, key := range keys { - table.Add([]byte(key)) + table.add([]byte(key)) } - wanted = []*KeyCount{ + wanted = []*keyCount{ {"c", &n8}, {"g", &n7}, {"e", &n6}, } - assertKeyCountsEqual(t, wanted, table.GetTop()) + assertKeyCountsEqual(t, wanted, table.getTop()) } func Test_newTable(t *testing.T) { - table := NewCounter(333) - top := table.GetTop() + table := newCounter(333) + top := table.getTop() if len(top) != 0 { t.Error("new table should be empty") } } func Test_Merge(t *testing.T) { - a := NewCounter(10) + a := newCounter(10) b := newSegmentCounter() c := newSegmentCounter() for i := 0; i < 50; i++ { - b.Add([]byte{byte('A')}) - b.Add([]byte{byte('B')}) - c.Add([]byte{byte('C')}) - c.Add([]byte{byte('A')}) + b.add([]byte{byte('A')}) + b.add([]byte{byte('B')}) + c.add([]byte{byte('C')}) + c.add([]byte{byte('A')}) } - c.Add([]byte{byte('C')}) + c.add([]byte{byte('C')}) a.merge(b) a.merge(c) - exp := []*KeyCount{ + exp := []*keyCount{ {"A", pv(100)}, {"C", pv(51)}, {"B", pv(50)}, } - assertKeyCountsEqual(t, exp, a.GetTop()) + assertKeyCountsEqual(t, exp, a.getTop()) } func pv(v uint64) *uint64 { return &v } -func assertKeyCountsEqual(t *testing.T, exp 
[]*KeyCount, act []*KeyCount) { +func assertKeyCountsEqual(t *testing.T, exp []*keyCount, act []*keyCount) { t.Helper() if len(exp) != len(act) { t.Errorf("Expecting %d results, but got %d", len(exp), len(act)) } for i := 0; i < min(len(exp), len(act)); i++ { if exp[i].Key != act[i].Key { - t.Errorf("Unexpected key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key) + t.Errorf("Unexpected Key %v at index %d, expecting %v", act[i].Key, i, exp[i].Key) } if *exp[i].Count != *act[i].Count { t.Errorf("Unexpected count of %d at index %d, expecting %d", *act[i].Count, i, *exp[i].Count) diff --git a/internal/filters.go b/internal/filters.go index d76660b..993d67d 100644 --- a/internal/filters.go +++ b/internal/filters.go @@ -4,60 +4,60 @@ import ( "regexp" ) -// Sed represents a sed(1) s/a/b/g operation. -type Sed struct { +// sed represents a sed(1) s/a/b/g operation. +type sed struct { ReplaceThis *regexp.Regexp WithThat []byte } -// Filters contains the filters to be applied prior to top-few computation. -type Filters struct { - Greps []*regexp.Regexp - VGreps []*regexp.Regexp - Seds []*Sed +// filters contains the filters to be applied prior to top-few computation. +type filters struct { + greps []*regexp.Regexp + vgreps []*regexp.Regexp + seds []*sed } -// AddSed appends a new Sed operation to the filters. -func (f *Filters) AddSed(replaceThis string, withThat string) error { +// addSed appends a new sed operation to the filters. +func (f *filters) addSed(replaceThis string, withThat string) error { re, err := regexp.Compile(replaceThis) if err == nil { - f.Seds = append(f.Seds, &Sed{re, []byte(withThat)}) + f.seds = append(f.seds, &sed{re, []byte(withThat)}) } return err } -// AddGrep appends a new grep/regex to the filters. Only items that match +// addGrep appends a new grep/regex to the filters. Only items that match // this regex will be counted. 
-func (f *Filters) AddGrep(s string) error { +func (f *filters) addGrep(s string) error { re, err := regexp.Compile(s) if err == nil { - f.Greps = append(f.Greps, re) + f.greps = append(f.greps, re) } return err } -// AddVgrep appends a new inverse grep/regex to the filters (ala grep -v). +// addVgrep appends a new inverse grep/regex to the filters (ala grep -v). // Only items that don't match the regex will be counted. -func (f *Filters) AddVgrep(s string) error { +func (f *filters) addVgrep(s string) error { re, err := regexp.Compile(s) if err == nil { - f.VGreps = append(f.VGreps, re) + f.vgreps = append(f.vgreps, re) } return err } -// FilterRecord returns true if the supplied record passes all the Filter +// filterRecord returns true if the supplied record passes all the filter // criteria. -func (f *Filters) FilterRecord(bytes []byte) bool { - if f.Greps == nil && f.VGreps == nil { +func (f *filters) filterRecord(bytes []byte) bool { + if f.greps == nil && f.vgreps == nil { return true } - for _, re := range f.Greps { + for _, re := range f.greps { if !re.Match(bytes) { return false } } - for _, re := range f.VGreps { + for _, re := range f.vgreps { if re.Match(bytes) { return false } @@ -65,9 +65,9 @@ func (f *Filters) FilterRecord(bytes []byte) bool { return true } -// FilterField returns a key that has had all the sed operations applied to it. -func (f *Filters) FilterField(bytes []byte) []byte { - for _, sed := range f.Seds { +// filterField returns a Key that has had all the sed operations applied to it. 
+func (f *filters) filterField(bytes []byte) []byte { + for _, sed := range f.seds { bytes = sed.ReplaceThis.ReplaceAll(bytes, sed.WithThat) } return bytes diff --git a/internal/filters_test.go b/internal/filters_test.go index 681320c..d55cd70 100644 --- a/internal/filters_test.go +++ b/internal/filters_test.go @@ -15,18 +15,18 @@ func TestSeds(t *testing.T) { /*6*/ "172.124.211.165 - - [04/May/2020:06:47:40 -0700] \"GET /ongoing/serif.css HTTP/1.1\" 200 2177 \"https://www.tbray.org/ongoing/When/202x/2020/04/29/Leaving-Amazon\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0\"\n", } - var filter Filters - err := filter.AddSed("^.*\\[04/May/2020:", "") + var filter filters + err := filter.addSed("^.*\\[04/May/2020:", "") if err != nil { t.Error("remove prefix" + err.Error()) } - err = filter.AddSed(" .*\n", "") + err = filter.addSed(" .*\n", "") if err != nil { t.Error("isolate time " + err.Error()) } wanted := []string{"06:36:20", "06:47:04", "06:47:09", "06:47:12", "06:47:14", "06:47:32", "06:47:40"} for i, line := range lines { - got := string(filter.FilterField([]byte(line))) + got := string(filter.filterField([]byte(line))) if got != wanted[i] { t.Errorf("Wanted [%s], got [%s]", wanted[i], got) } @@ -45,7 +45,7 @@ func TestFilterCombos(t *testing.T) { } /* - Fields := []string{ + fields := []string{ "foo", // 0 "bar", // 1 "donkey", // 2 @@ -59,14 +59,14 @@ func TestFilterCombos(t *testing.T) { wantCSS := "\"GET \\S+\\.css " var err error - var recordFilter Filters - err = recordFilter.AddGrep(wantCSS) + var recordFilter filters + err = recordFilter.addGrep(wantCSS) if err != nil { - t.Error("AddGrep " + err.Error()) + t.Error("addGrep " + err.Error()) } for i, line := range lines { - matched := recordFilter.FilterRecord([]byte(line)) + matched := recordFilter.filterRecord([]byte(line)) if matched { if i != 6 { t.Error("Matched " + lines[i]) @@ -78,13 +78,13 @@ func TestFilterCombos(t *testing.T) { } } - recordFilter = 
Filters{nil, nil, nil} - err = recordFilter.AddVgrep(wantCSS) + recordFilter = filters{nil, nil, nil} + err = recordFilter.addVgrep(wantCSS) if err != nil { - t.Error("AddVgrep" + err.Error()) + t.Error("addVgrep" + err.Error()) } for i, line := range lines { - matched := recordFilter.FilterRecord([]byte(line)) + matched := recordFilter.filterRecord([]byte(line)) if !matched { if i != 6 { t.Error("Didn't match " + lines[i]) @@ -96,17 +96,17 @@ func TestFilterCombos(t *testing.T) { } } - recordFilter = Filters{nil, nil, nil} - err = recordFilter.AddGrep("\"GET \\S*-Amazon ") + recordFilter = filters{nil, nil, nil} + err = recordFilter.addGrep("\"GET \\S*-Amazon ") if err != nil { - t.Error("AddGrep " + err.Error()) + t.Error("addGrep " + err.Error()) } - err = recordFilter.AddVgrep("Leaving-") + err = recordFilter.addVgrep("Leaving-") if err != nil { - t.Error("AddVgrep " + err.Error()) + t.Error("addVgrep " + err.Error()) } for i, line := range lines { - matched := recordFilter.FilterRecord([]byte(line)) + matched := recordFilter.filterRecord([]byte(line)) if matched { if i != 4 { t.Error("Matched " + lines[i]) @@ -118,18 +118,18 @@ func TestFilterCombos(t *testing.T) { } } - recordFilter = Filters{nil, nil, nil} - err = recordFilter.AddGrep("\"GET \\S+-Amazon ") + recordFilter = filters{nil, nil, nil} + err = recordFilter.addGrep("\"GET \\S+-Amazon ") if err != nil { - t.Error("AddGrep " + err.Error()) + t.Error("addGrep " + err.Error()) } - err = recordFilter.AddGrep("^54.38.222.160 ") + err = recordFilter.addGrep("^54.38.222.160 ") if err != nil { - t.Error("AddGrep " + err.Error()) + t.Error("addGrep " + err.Error()) } matched := 0 for _, line := range lines { - if recordFilter.FilterRecord([]byte(line)) { + if recordFilter.filterRecord([]byte(line)) { matched++ } } diff --git a/internal/keyfinder.go b/internal/keyfinder.go index 0e21cb6..ad53d88 100644 --- a/internal/keyfinder.go +++ b/internal/keyfinder.go @@ -1,7 +1,7 @@ package topfew -// Extract a key from 
a record based on a list of keys. If the list is empty, the key is the whole record. -// Otherwise, there's a list of Fields. They are extracted, joined with spaces, and that's the key +// Extract a Key from a record based on a list of keys. If the list is empty, the Key is the whole record. +// Otherwise, there's a list of fields. They are extracted, joined with spaces, and that's the Key // First implementation was regexp based but Golang regexps are slow. So we'll use a hand-built state machine that // only cares whether each byte encodes space-or-tab or not. @@ -10,20 +10,20 @@ import ( "errors" ) -// NER is the error message returned when the input has fewer Fields than the KeyFinder is configured for. +// NER is the error message returned when the input has fewer fields than the keyFinder is configured for. const NER = "not enough bytes in record" -// KeyFinder extracts a key based on the specified Fields from a record. Fields is a slice of small integers +// keyFinder extracts a Key based on the specified fields from a record. fields is a slice of small integers // representing field numbers; 1-based on the command line, 0-based here. -type KeyFinder struct { +type keyFinder struct { fields []uint key []byte } -// NewKeyFinder creates a new key finder with the supplied field numbers, the input should be 1 based. -// KeyFinder is not thread-safe, you should Clone it for each goroutine that uses it. -func NewKeyFinder(keys []uint) *KeyFinder { - kf := KeyFinder{ +// newKeyFinder creates a new Key finder with the supplied field numbers, the input should be 1 based. +// keyFinder is not thread-safe, you should clone it for each goroutine that uses it. +func newKeyFinder(keys []uint) *keyFinder { + kf := keyFinder{ key: make([]byte, 0, 128), } for _, knum := range keys { @@ -32,19 +32,19 @@ func NewKeyFinder(keys []uint) *KeyFinder { return &kf } -// Clone returns a new KeyFinder with the same configuration. 
Each goroutine should use its own -// KeyFinder instance. -func (kf *KeyFinder) Clone() *KeyFinder { - return &KeyFinder{ +// clone returns a new keyFinder with the same configuration. Each goroutine should use its own +// keyFinder instance. +func (kf *keyFinder) clone() *keyFinder { + return &keyFinder{ fields: kf.fields, key: make([]byte, 0, 128), } } -// GetKey extracts a key from the supplied record. This is applied to every record, +// getKey extracts a key from the supplied record. This is applied to every record, // so efficiency matters. -func (kf *KeyFinder) GetKey(record []byte) ([]byte, error) { - // if there are no key-finders just return the record, minus any trailing newlines +func (kf *keyFinder) getKey(record []byte) ([]byte, error) { + // if there are no Key-finders just return the record, minus any trailing newlines if len(kf.fields) == 0 { if record[len(record)-1] == '\n' { record = record[0 : len(record)-1] @@ -58,9 +58,9 @@ func (kf *KeyFinder) GetKey(record []byte) ([]byte, error) { index := 0 first := true - // for each field in the key + // for each field in the Key for _, keyField := range kf.fields { - // bypass Fields before the one we want + // bypass fields before the one we want for field < int(keyField) { index, err = pass(record, index) if err != nil { @@ -76,7 +76,7 @@ func (kf *KeyFinder) GetKey(record []byte) ([]byte, error) { kf.key = append(kf.key, ' ') } - // attach desired field to key + // attach desired field to Key kf.key, index, err = gather(kf.key, record, index) if err != nil { return nil, err @@ -97,7 +97,7 @@ func gather(key []byte, record []byte, index int) ([]byte, int, error) { return nil, 0, errors.New(NER) } - // copy key bytes + // copy Key bytes for index < len(record) && record[index] != ' ' && record[index] != '\t' && record[index] != '\n' { key = append(key, record[index]) index++ diff --git a/internal/keyfinder_test.go b/internal/keyfinder_test.go index de91d12..21da05c 100644 --- 
a/internal/keyfinder_test.go +++ b/internal/keyfinder_test.go @@ -11,27 +11,27 @@ func TestKeyFinder(t *testing.T) { "a b c", "a b c d e", } - var kf, kf2 *KeyFinder + var kf, kf2 *keyFinder - kf = NewKeyFinder(nil) - kf2 = NewKeyFinder([]uint{}) + kf = newKeyFinder(nil) + kf2 = newKeyFinder([]uint{}) for _, recordString := range records { record := []byte(recordString) - r, err := kf.GetKey(record) + r, err := kf.getKey(record) if (err != nil) || !bytes.Equal(r, record) { t.Errorf("bad result on nil for %s", record) } - r, err = kf2.GetKey(record) + r, err = kf2.getKey(record) if (err != nil) || !bytes.Equal(r, record) { t.Errorf("bad result on empty for %s", record) } } singles := []string{"x", "b", "b"} - kf = NewKeyFinder([]uint{2}) + kf = newKeyFinder([]uint{2}) for i, record := range records { - k, err := kf.GetKey([]byte(record)) + k, err := kf.getKey([]byte(record)) if err != nil { t.Error("KF fail on: " + record) } else { @@ -41,25 +41,25 @@ func TestKeyFinder(t *testing.T) { } } - kf = NewKeyFinder([]uint{1, 3}) + kf = newKeyFinder([]uint{1, 3}) for _, recordstring := range records { record := []byte(recordstring) - r, err := kf.GetKey(record) + r, err := kf.getKey(record) if err != nil || string(r) != "a c" { t.Errorf("wanted a c from %s, got %s", record, r) } } - kf = NewKeyFinder([]uint{1, 4}) + kf = newKeyFinder([]uint{1, 4}) tooShorts := []string{"a", "a b", "a b c"} for _, tooShortString := range tooShorts { tooShort := []byte(tooShortString) - _, err := kf.GetKey(tooShort) + _, err := kf.getKey(tooShort) if err == nil { t.Errorf("no error on %s", tooShort) } } - r, err := kf.GetKey([]byte("a b c d")) + r, err := kf.getKey([]byte("a b c d")) if err != nil || string(r) != "a d" { t.Error("border condition") } diff --git a/internal/runner.go b/internal/runner.go index 0065cbd..c83763d 100644 --- a/internal/runner.go +++ b/internal/runner.go @@ -6,33 +6,33 @@ import ( "os" ) -func Run(config *Config, instream io.Reader) ([]*KeyCount, error) { +func 
Run(config *config, instream io.Reader) ([]*keyCount, error) { // lifted out of main.go to facilitate testing - var kf = NewKeyFinder(config.Fields) - var topList []*KeyCount + var kf = newKeyFinder(config.fields) + var topList []*keyCount var err error if config.Fname == "" { - if config.Sample { - for i, sed := range config.Filter.Seds { + if config.sample { + for i, sed := range config.filter.seds { fmt.Printf("SED %d: s/%s/%s/\n", i, sed.ReplaceThis, sed.WithThat) } - err = Sample(instream, &config.Filter, kf) + err = sample(instream, &config.filter, kf) } else { - topList, err = FromStream(instream, &config.Filter, kf, config.Size) + topList, err = fromStream(instream, &config.filter, kf, config.size) } if err != nil { _, _ = fmt.Fprintf(os.Stderr, "Error reading stream: %s\n", err.Error()) return nil, err } } else { - counter := NewCounter(config.Size) - err = ReadFileInSegments(config.Fname, &config.Filter, counter, kf, config.Width) + counter := newCounter(config.size) + err = readFileInSegments(config.Fname, &config.filter, counter, kf, config.width) if err != nil { _, _ = fmt.Fprintf(os.Stderr, "Error processing %s: %s\n", config.Fname, err.Error()) return nil, err } - topList = counter.GetTop() + topList = counter.getTop() } return topList, err diff --git a/internal/runner_test.go b/internal/runner_test.go index fd27720..52643a3 100644 --- a/internal/runner_test.go +++ b/internal/runner_test.go @@ -65,7 +65,7 @@ func TestBadFile(t *testing.T) { args := []string{"/nosuch"} c, err := Configure(args) if err != nil { - t.Error("Config!") + t.Error("config!") } _, err = Run(c, nil) if err == nil { diff --git a/internal/sampler.go b/internal/sampler.go index 4d845d9..6e5886d 100644 --- a/internal/sampler.go +++ b/internal/sampler.go @@ -8,8 +8,8 @@ import ( "io" ) -// Sample prints out what amounts to a debugging feed, showing how the filtering and keyrewriting are working. 
-func Sample(ioReader io.Reader, filters *Filters, kf *KeyFinder) error { +// sample prints out what amounts to a debugging feed, showing how the filtering and keyrewriting are working. +func sample(ioReader io.Reader, filters *filters, kf *keyFinder) error { reader := bufio.NewReader(ioReader) for { record, err := reader.ReadBytes('\n') @@ -19,18 +19,18 @@ func Sample(ioReader io.Reader, filters *Filters, kf *KeyFinder) error { return err } - if filters.FilterRecord(record) { + if filters.filterRecord(record) { fmt.Print(" ACCEPT: " + string(record)) } else { fmt.Print(" REJECT: " + string(record)) continue } - keyBytes, err := kf.GetKey(record) + keyBytes, err := kf.getKey(record) if err != nil { return err } - filtered := filters.FilterField(keyBytes) + filtered := filters.filterField(keyBytes) if bytes.Equal(keyBytes, filtered) { fmt.Printf("KEY AS IS: %s\n", string(filtered)) } else { diff --git a/internal/segmenter.go b/internal/segmenter.go index f9d2906..0249fba 100644 --- a/internal/segmenter.go +++ b/internal/segmenter.go @@ -9,18 +9,18 @@ import ( "runtime" ) -// Segment represents a segment of a file. Is required to begin at the start of a line, i.e. start of file or +// segment represents a segment of a file. Is required to begin at the start of a line, i.e. start of file or // after a \n. -type Segment struct { +type segment struct { start int64 end int64 file *os.File } -// ReadFileInSegments breaks the file up into multiple segments and then reads them in parallel. counter +// readFileInSegments breaks the file up into multiple segments and then reads them in parallel. counter // will be updated with the resulting occurrence counts. 
-func ReadFileInSegments(fname string, filter *Filters, counter *Counter, kf *KeyFinder, width int) error { - // find file Size +func readFileInSegments(fname string, filter *filters, counter *counter, kf *keyFinder, width int) error { + // find file size file, err := os.Open(fname) if err != nil { return err @@ -40,7 +40,7 @@ func ReadFileInSegments(fname string, filter *Filters, counter *Counter, kf *Key } // compute segments and put them in a slice - var segments []*Segment + var segments []*segment base := int64(0) for base < fileSize { // each segment starts at the beginning of a line and ends after a newline (or at EOF) @@ -62,13 +62,13 @@ func ReadFileInSegments(fname string, filter *Filters, counter *Counter, kf *Key if res.err != nil { return err } - counter.merge(res.counters) + counter.merge(res.segCounter) } return nil } // the start value is guaranteed to be at file start or after newline -func newSegment(fname string, start int64, end int64) (*Segment, error) { +func newSegment(fname string, start int64, end int64) (*segment, error) { // All these "err != nil" tests on basic filesystem seek operations are probably superfluous and // drive down the test coverage @@ -106,29 +106,29 @@ func newSegment(fname string, start int64, end int64) (*Segment, error) { if offset != start { return nil, fmt.Errorf("tried to seek to %d, went to %d", start, offset) } - return &Segment{start, end, file}, nil + return &segment{start, end, file}, nil } type segmentResult struct { // one of these will be set - err error - counters segmentCounter + err error + segCounter segmentCounter } // we've already opened the file and seeked to the right place -func readSegment(s *Segment, filter *Filters, kf *KeyFinder, reportCh chan segmentResult) { +func readSegment(s *segment, filter *filters, kf *keyFinder, reportCh chan segmentResult) { // noinspection ALL defer s.file.Close() reader := bufio.NewReaderSize(s.file, 16*1024) current := s.start - counters := newSegmentCounter() - 
kf = kf.Clone() + segCounter := newSegmentCounter() + kf = kf.clone() for current < s.end { // ReadSlice results are only valid until the next call to Read, so we need // to be careful about how long we hang onto the record slice. The SegmentCounter // is the only thing that holds onto data from record, and it has to make a copy - // anyway when it constructs its string key. So this is safe. + // anyway when it constructs its string Key. So this is safe. record, err := reader.ReadSlice('\n') // ReadSlice returns an error if a line doesn't fit in its buffer. We // deal with that by switching to ReadBytes to get the remainder of the line. @@ -145,16 +145,16 @@ func readSegment(s *Segment, filter *Filters, kf *KeyFinder, reportCh chan segme return } current += int64(len(record)) - if !filter.FilterRecord(record) { + if !filter.filterRecord(record) { continue } - keyBytes, err := kf.GetKey(record) + keyBytes, err := kf.getKey(record) if err != nil { // bypass - _, _ = fmt.Fprintf(os.Stderr, "Can't extract key from %s\n", string(record)) + _, _ = fmt.Fprintf(os.Stderr, "Can't extract Key from %s\n", string(record)) continue } - counters.Add(filter.FilterField(keyBytes)) + segCounter.add(filter.filterField(keyBytes)) } - reportCh <- segmentResult{counters: counters} + reportCh <- segmentResult{segCounter: segCounter} } diff --git a/internal/segmenter_test.go b/internal/segmenter_test.go index 05053e5..e4235b5 100644 --- a/internal/segmenter_test.go +++ b/internal/segmenter_test.go @@ -20,20 +20,20 @@ func TestReadAll(t *testing.T) { if offs != 4176 || err != nil { t.Error("OUCH") } - s := Segment{4176, 4951, file} - kf := NewKeyFinder([]uint{7}) + s := segment{4176, 4951, file} + kf := newKeyFinder([]uint{7}) ch := make(chan segmentResult) - f := Filters{nil, nil, nil} + f := filters{nil, nil, nil} go readSegment(&s, &f, kf, ch) segres := <-ch if segres.err != nil { t.Fatalf("got error from segment reader %v", segres.err) } - counter := NewCounter(10) - 
counter.merge(segres.counters) + counter := newCounter(10) + counter.merge(segres.segCounter) - res := counter.GetTop() + res := counter.getTop() var want = map[string]bool{ "/ongoing/picInfo.xml?o=https://old.tbray.org/ongoing/": true, "/ongoing/in-feed.xml": true, @@ -65,7 +65,7 @@ func TestReadSegmentFiltering(t *testing.T) { args := []string{"-f", "2", "-g", "foo"} c, err := Configure(args) if err != nil { - t.Error("Config!") + t.Error("config!") } tmpName := fmt.Sprintf("/tmp/tf-%d", os.Getpid()) @@ -76,8 +76,8 @@ func TestReadSegmentFiltering(t *testing.T) { defer func() { _ = os.Remove(tmpName) }() _, _ = fmt.Fprint(tmpfile, input) _ = tmpfile.Close() - counter := NewCounter(10) - err = ReadFileInSegments(tmpName, &c.Filter, counter, NewKeyFinder(c.Fields), 1) + counter := newCounter(10) + err = readFileInSegments(tmpName, &c.filter, counter, newKeyFinder(c.fields), 1) if err != nil { t.Error("Run? " + err.Error()) } @@ -109,15 +109,15 @@ func TestVeryLongLines(t *testing.T) { _, _ = fmt.Fprintln(tmpfile, c3) } _ = tmpfile.Close() - counter := NewCounter(10) - err = ReadFileInSegments(tmpName, &Filters{}, counter, NewKeyFinder(nil), 1) + counter := newCounter(10) + err = readFileInSegments(tmpName, &filters{}, counter, newKeyFinder(nil), 1) if err != nil { t.Fatal("Failed to read long-lines file") } assertKeyCountsEqual(t, - []*KeyCount{ + []*keyCount{ {a80k, pv(5)}, {c3, pv(3)}, {b30k, pv(2)}}, - counter.GetTop()) + counter.getTop()) } diff --git a/internal/stream.go b/internal/stream.go index d2e9171..e3a82a9 100644 --- a/internal/stream.go +++ b/internal/stream.go @@ -8,9 +8,9 @@ import ( "os" ) -// FromStream reads a stream and hands each line to the top-occurrence counter. Currently only used on stdin. -func FromStream(ioReader io.Reader, filters *Filters, kf *KeyFinder, size int) ([]*KeyCount, error) { - counter := NewCounter(size) +// fromStream reads a stream and hands each line to the top-occurrence counter. Currently only used on stdin. 
+func fromStream(ioReader io.Reader, filters *filters, kf *keyFinder, size int) ([]*keyCount, error) {
+	counter := newCounter(size)
 	reader := bufio.NewReader(ioReader)
 	for {
 		record, err := reader.ReadBytes('\n')
@@ -20,18 +20,18 @@ func FromStream(ioReader io.Reader, filters *Filters, kf *KeyFinder, size int) (
 			return nil, err
 		}
 
-		if !filters.FilterRecord(record) {
+		if !filters.filterRecord(record) {
 			continue
 		}
-		keyBytes, err := kf.GetKey(record)
+		keyBytes, err := kf.getKey(record)
 		if err != nil {
 			// bypass
-			_, _ = fmt.Fprintf(os.Stderr, "Can't extract key from %s\n", string(record))
+			_, _ = fmt.Fprintf(os.Stderr, "Can't extract key from %s\n", string(record))
 			continue
 		}
-		keyBytes = filters.FilterField(keyBytes)
+		keyBytes = filters.filterField(keyBytes)
 
-		counter.Add(keyBytes)
+		counter.add(keyBytes)
 	}
-	return counter.GetTop(), nil
+	return counter.getTop(), nil
 }
diff --git a/internal/stream_test.go b/internal/stream_test.go
index 62961bb..ac9af42 100644
--- a/internal/stream_test.go
+++ b/internal/stream_test.go
@@ -11,10 +11,10 @@ func TestBadStreamReader(t *testing.T) {
 	args := []string{}
 	c, err := Configure(args)
 	if err != nil {
-		t.Error("Config!")
+		t.Error("config!")
 	}
 	cer := newCER("testing stream")
-	_, err = FromStream(cer, &c.Filter, nil, c.Size)
+	_, err = fromStream(cer, &c.filter, nil, c.size)
 	if err == nil {
 		t.Error("survived err from Read")
 	}
@@ -27,7 +27,7 @@ func TestStreamProcessing(t *testing.T) {
 	args := []string{"-f", "3", "--vgrep", "FOO"}
 	c, err := Configure(args)
 	if err != nil {
-		t.Error("Config: " + err.Error())
+		t.Error("config: " + err.Error())
 	}
 	input := "FOO\nBAR\n"
 	stringreader := bufio.NewReader(strings.NewReader(input))
@@ -48,9 +48,9 @@ func Test1KLinesStream(t *testing.T) {
 	//noinspection ALL
 	defer file.Close()
 
-	kf := NewKeyFinder([]uint{1})
-	f := Filters{nil, nil, nil}
-	x, err := FromStream(file, &f, kf, 5)
+	kf := newKeyFinder([]uint{1})
+	f := filters{nil, nil, nil}
+	x, err := fromStream(file, &f, kf, 5)
 	if err 
!= nil {
 		t.Error("OUCH: " + err.Error())
 	}
@@ -68,7 +68,7 @@ func Test1KLinesStream(t *testing.T) {
 
 	for _, kc := range x {
 		if *kc.Count != uint64(wanted[kc.Key]) {
-			t.Error("Wrong count for key: " + kc.Key)
+			t.Error("Wrong count for key: " + kc.Key)
 		}
 	}
 }