
Merge pull request #23 from timbray/issue-20
add regexp field separator option
timbray authored Apr 25, 2024
2 parents dec140a + 9f681ac, commit ecad0df
Showing 9 changed files with 163 additions and 48 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -50,15 +50,24 @@ All the arguments are optional; if none are provided, tf will read records
from the standard input and list the 10 which occur most often.
```
## Options
`-n integer`, `--number integer`
How many of the highest‐occurrence‐count lines to print out.
The default value is 10.
`-f fieldlist`, `--fields fieldlist`
Specifies which fields should be extracted from incoming records and used in computing occurrence counts.
The fieldlist must be a comma‐separated list of integers identifying field numbers, which start at one, for example 3 and 2,5,6.
The fields must be provided in order, so 3,1,7 is an error.
If no fieldlist is provided, **tf** treats the whole input record as a single field.
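For illustration only, the rules above could be checked along these lines; this is a hypothetical sketch of the documented behaviour, not **tf**'s actual `parseFields` (which this commit does not change):
```
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// checkFieldList is a hypothetical validator for the documented rules:
// comma-separated integers, numbered from one, in ascending order.
func checkFieldList(s string) ([]uint, error) {
	var fields []uint
	for _, tok := range strings.Split(s, ",") {
		n, err := strconv.Atoi(tok)
		if err != nil || n < 1 {
			return nil, fmt.Errorf("bad field number %q", tok)
		}
		if len(fields) > 0 && uint(n) <= fields[len(fields)-1] {
			return nil, fmt.Errorf("fields must be provided in order: %s", s)
		}
		fields = append(fields, uint(n))
	}
	return fields, nil
}

func main() {
	fmt.Println(checkFieldList("2,5,6")) // [2 5 6] <nil>
	fmt.Println(checkFieldList("3,1,7")) // error: fields must be provided in order
}
```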
`-p separator`, `--fieldseparator separator`
Provides a regular expression that is used as a field separator instead of the default white space.
This is likely to incur a significant performance cost.
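For illustration, here is roughly what the option does, sketched directly with Go's `regexp` package rather than through **tf**; the record and separator are the ones used by this commit's `TestFieldSeparator`:
```
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// tf -p 'tt*' -f 2,4 would treat runs of "t" as the field separator.
	re := regexp.MustCompile("tt*")
	fields := re.Split("atbttctttttdtttte", -1)
	// fields is now ["a" "b" "c" "d" "e"]; fields 2 and 4, joined by a
	// single space, make the key "b d".
	fmt.Println(strings.Join([]string{fields[1], fields[3]}, " "))
}
```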
`-g regexp`, `--grep regexp`
The initial **g** suggests `grep`.
@@ -101,6 +110,8 @@ Describes the function and options of **tf**.
Records are separated by newlines, fields within records by white space, defined as one or more space or tab characters.
The field separator can be overridden with the --fieldseparator option.
## Performance issues
Since the effect of topfew can be exactly duplicated with a combination of `awk`, `grep`, `sed` and `sort`, you wouldn’t be using it if you didn’t care about performance.
26 changes: 20 additions & 6 deletions internal/config.go
@@ -4,17 +4,19 @@ import (
"errors"
"fmt"
"os"
"regexp"
"strconv"
"strings"
)

type config struct {
size           int
fields         []uint
fieldSeparator *regexp.Regexp
Fname          string
filter         filters
width          int
sample         bool
}

func Configure(args []string) (*config, error) {
@@ -43,6 +45,13 @@ func Configure(args []string) (*config, error) {
i++
config.fields, err = parseFields(args[i])
}
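// -p/--fieldseparator takes a regular expression; it is compiled right here,
// so an invalid pattern is reported as a configuration error at startup.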
case arg == "-p" || arg == "--fieldseparator":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --fieldseparator")
} else {
i++
config.fieldSeparator, err = regexp.Compile(args[i])
}
case arg == "-g" || arg == "--grep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --grep")
@@ -123,6 +132,7 @@ order of occurrences.
Usage: tf
-n, --number (output line count) [default is 10]
-f, --fields (field list) [default is the whole record]
-p, --fieldseparator (field separator regex) [default is white space]
-g, --grep (regexp) [may repeat, default is accept all]
-v, --vgrep (regexp) [may repeat, default is reject none]
-s, --sed (regexp) (replacement) [may repeat, default is no changes]
@@ -137,6 +147,10 @@ from the standard input and list the 10 which occur most often.
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields
must be provided in order, so 3,1,7 is an error.
Fields are separated by white space (spaces or tabs) by default.
This can be overridden with the --fieldseparator option, at some cost in
performance.
The regexp-valued fields work as follows:
-g/--grep discards records that don't match the regexp (g for grep)
-v/--vgrep discards records that do match the regexp (v for grep -v)
2 changes: 2 additions & 0 deletions internal/config_test.go
@@ -14,6 +14,7 @@ func TestArgSyntax(t *testing.T) {
{"--sample", "--trace"}, {"--trace"},
{"--sed"}, {"-s", "x"}, {"--sample", "--sed", "1"},
{"--width", "a"}, {"-w", "0"}, {"--sample", "-w"},
{"--sample", "-p"}, {"--fieldseparator", "a["},
}

// not testing -h/--help because it'd be extra work to avoid printing out the usage
@@ -26,6 +27,7 @@ func TestArgSyntax(t *testing.T) {
{"--sample"},
{"--width", "2"}, {"-w", "3"},
{"--sample", "fname"},
{"-p", "a[bc]*d$"},
}

for _, bad := range bads {
77 changes: 47 additions & 30 deletions internal/keyfinder.go
@@ -8,6 +8,7 @@ package topfew

import (
"errors"
"regexp"
)

// NER is the error message returned when the input has fewer fields than the keyFinder is configured for.
@@ -16,73 +17,89 @@ const NER = "not enough bytes in record"
// keyFinder extracts a Key based on the specified fields from a record. fields is a slice of small integers
// representing field numbers; 1-based on the command line, 0-based here.
type keyFinder struct {
fields    []uint
key       []byte
separator *regexp.Regexp
}

// newKeyFinder creates a new Key finder with the supplied field numbers, the input should be 1 based.
// keyFinder is not thread-safe, you should clone it for each goroutine that uses it.
func newKeyFinder(keys []uint, separator *regexp.Regexp) *keyFinder {
kf := keyFinder{
key: make([]byte, 0, 128),
}
for _, knum := range keys {
kf.fields = append(kf.fields, knum-1)
}
kf.separator = separator
return &kf
}

// clone returns a new keyFinder with the same configuration. Each goroutine should use its own
// keyFinder instance.
func (kf *keyFinder) clone() *keyFinder {
return &keyFinder{
fields:    kf.fields,
key:       make([]byte, 0, 128),
separator: kf.separator,
}
}

// getKey extracts a key from the supplied record. This is applied to every record,
// so efficiency matters.
func (kf *keyFinder) getKey(record []byte) ([]byte, error) {
// if there are no Key-finders just return the record, minus any trailing newlines
if len(kf.fields) == 0 && kf.separator == nil {
if record[len(record)-1] == '\n' {
record = record[0 : len(record)-1]
}
return record, nil
}

var err error
if kf.separator == nil {
kf.key = kf.key[:0]
field := 0
index := 0
first := true

// for each field in the Key
for _, keyField := range kf.fields {
// bypass fields before the one we want
for field < int(keyField) {
index, err = pass(record, index)
if err != nil {
return nil, err
}
field++
}

// join(' ', kf)
if first {
first = false
} else {
kf.key = append(kf.key, ' ')
}

// attach desired field to Key
kf.key, index, err = gather(kf.key, record, index)
if err != nil {
return nil, err
}

field++
}
} else {
kf.key = kf.key[:0]
allFields := kf.separator.Split(string(record), -1)
for i, field := range kf.fields {
if int(field) >= len(allFields) {
return nil, errors.New(NER)
}
if i > 0 {
kf.key = append(kf.key, ' ')
}
kf.key = append(kf.key, []byte(allFields[field])...)
}
}
return kf.key, err
}
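The regexp path above converts each record to a string and calls `Split`, which is where the performance cost mentioned in the README comes from. A hypothetical micro-benchmark along these lines (not part of this commit; it assumes it lives in the same internal package as the tests below, with `regexp` and `testing` imported) could compare the two paths:
```
// BenchmarkKeyFinderSeparator is a sketch, not part of this commit: it compares
// the default whitespace key path with the regexp-separator path on one record.
func BenchmarkKeyFinderSeparator(b *testing.B) {
	record := []byte("alpha beta gamma delta epsilon\n")
	byWhitespace := newKeyFinder([]uint{2, 4}, nil)
	byRegexp := newKeyFinder([]uint{2, 4}, regexp.MustCompile(`[ \t]+`))

	b.Run("whitespace", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			if _, err := byWhitespace.getKey(record); err != nil {
				b.Fatal(err)
			}
		}
	})
	b.Run("regexp", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			if _, err := byRegexp.getKey(record); err != nil {
				b.Fatal(err)
			}
		}
	})
}
```
Both sub-benchmarks produce the same key ("beta delta"); the regexp variant just pays for the string conversion and the per-record `Split` allocation.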
71 changes: 66 additions & 5 deletions internal/keyfinder_test.go
@@ -2,9 +2,70 @@ package topfew

import (
"bytes"
"os"
"testing"
)

func TestFieldSeparator(t *testing.T) {
args := []string{"-p", "tt*", "-f", "2,4"}

c, err := Configure(args)
if err != nil {
t.Error("Config!")
}

records := []string{
"atbttctttttdtttte",
}
wanted := []string{
"b d",
}
kf := newKeyFinder(c.fields, c.fieldSeparator)
for i, record := range records {
got, err := kf.getKey([]byte(record))
if err != nil {
t.Error("getKey: " + err.Error())
}
if string(got) != wanted[i] {
t.Errorf("wanted %s got %s", wanted[i], string(got))
}
}
_, err = kf.getKey([]byte("atbtc"))
if err == nil || err.Error() != NER {
t.Error("bad error value")
}
}

func TestCSVSeparator(t *testing.T) {
args := []string{"-p", ",", "-f", "11"}
c, err := Configure(args)
if err != nil {
t.Error("Config!")
}
input, err := os.Open("../test/data/csoc.csv")
if err != nil {
t.Error("Open: " + err.Error())
}
counts, err := Run(c, input)
if err != nil {
t.Error("Run: " + err.Error())
}
if len(counts) != 5 {
t.Errorf("Got %d results, wanted 5", len(counts))
}
wantCounts := []uint64{4, 2, 1, 1, 1}
wantKeys := []string{"50", "-1.97", "amount", "-1.75", "-1.9"}
for i, count := range counts {
if *count.Count != wantCounts[i] {
t.Errorf("Counts[%d] is %d wanted %d", i, *count.Count, wantCounts[i])
}
// because for equal values, the sort isn't stable - Counts[2,3,4] are all 1
if i < 2 && count.Key != wantKeys[i] {
t.Errorf("Keys[%d] is %s wanted %s", i, count.Key, wantKeys[i])
}
}
}

func TestKeyFinder(t *testing.T) {
var records = []string{
"a x c",
@@ -13,8 +74,8 @@
}
var kf, kf2 *keyFinder

kf = newKeyFinder(nil, nil)
kf2 = newKeyFinder([]uint{}, nil)

for _, recordString := range records {
record := []byte(recordString)
@@ -29,7 +90,7 @@
}

singles := []string{"x", "b", "b"}
kf = newKeyFinder([]uint{2}, nil)
for i, record := range records {
k, err := kf.getKey([]byte(record))
if err != nil {
@@ -41,7 +102,7 @@
}
}

kf = newKeyFinder([]uint{1, 3}, nil)
for _, recordstring := range records {
record := []byte(recordstring)
r, err := kf.getKey(record)
@@ -50,7 +111,7 @@
}
}

kf = newKeyFinder([]uint{1, 4}, nil)
tooShorts := []string{"a", "a b", "a b c"}
for _, tooShortString := range tooShorts {
tooShort := []byte(tooShortString)
2 changes: 1 addition & 1 deletion internal/runner.go
@@ -8,7 +8,7 @@ import (

func Run(config *config, instream io.Reader) ([]*keyCount, error) {
// lifted out of main.go to facilitate testing
var kf = newKeyFinder(config.fields, config.fieldSeparator)
var topList []*keyCount
var err error

6 changes: 3 additions & 3 deletions internal/segmenter_test.go
@@ -21,7 +21,7 @@ func TestReadAll(t *testing.T) {
t.Error("OUCH")
}
s := segment{4176, 4951, file}
kf := newKeyFinder([]uint{7}, nil)
ch := make(chan segmentResult)
f := filters{nil, nil, nil}
go readSegment(&s, &f, kf, ch)
@@ -77,7 +77,7 @@ func TestReadSegmentFiltering(t *testing.T) {
_, _ = fmt.Fprint(tmpfile, input)
_ = tmpfile.Close()
counter := newCounter(10)
err = readFileInSegments(tmpName, &c.filter, counter, newKeyFinder(c.fields, nil), 1)
if err != nil {
t.Error("Run? " + err.Error())
}
@@ -110,7 +110,7 @@ func TestVeryLongLines(t *testing.T) {
}
_ = tmpfile.Close()
counter := newCounter(10)
err = readFileInSegments(tmpName, &filters{}, counter, newKeyFinder(nil, nil), 1)
if err != nil {
t.Fatal("Failed to read long-lines file")
}
2 changes: 1 addition & 1 deletion internal/stream_test.go
@@ -48,7 +48,7 @@ func Test1KLinesStream(t *testing.T) {
//noinspection ALL
defer file.Close()

kf := newKeyFinder([]uint{1}, nil)
f := filters{nil, nil, nil}
x, err := fromStream(file, &f, kf, 5)
if err != nil {