
Merge pull request #23 from timbray/issue-20
add regexp field separator option
timbray authored Apr 25, 2024
2 parents dec140a + 9f681ac, commit ecad0df
Showing 9 changed files with 163 additions and 48 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -50,15 +50,24 @@ All the arguments are optional; if none are provided, tf will read records
from the standard input and list the 10 which occur most often.
```
## Options
`-n integer`, `--number integer`
How many of the highest‐occurrence‐count lines to print out.
The default value is 10.
`-f fieldlist`, `--fields fieldlist`
Specifies which fields should be extracted from incoming records and used in computing occurrence counts.
The fieldlist must be a comma‐separated list of integers identifying field numbers, which start at one, for example 3 and 2,5,6.
The fields must be provided in order, so 3,1,7 is an error.
If no fieldlist is provided, **tf** treats the whole input record as a single field.
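For illustration only, the rules above could be checked along these lines; this is a hypothetical sketch of the documented behaviour, not **tf**'s actual `parseFields` (which this commit does not change):
```
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// checkFieldList is a hypothetical validator for the documented rules:
// comma-separated integers, numbered from one, in ascending order.
func checkFieldList(s string) ([]uint, error) {
	var fields []uint
	for _, tok := range strings.Split(s, ",") {
		n, err := strconv.Atoi(tok)
		if err != nil || n < 1 {
			return nil, fmt.Errorf("bad field number %q", tok)
		}
		if len(fields) > 0 && uint(n) <= fields[len(fields)-1] {
			return nil, fmt.Errorf("fields must be provided in order: %s", s)
		}
		fields = append(fields, uint(n))
	}
	return fields, nil
}

func main() {
	fmt.Println(checkFieldList("2,5,6")) // [2 5 6] <nil>
	fmt.Println(checkFieldList("3,1,7")) // error: fields must be provided in order
}
```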
`-p separator`, `--fieldseparator separator`
Provides a regular expression that is used as a field separator instead of the default white space.
This is likely to incur a significant performance cost.
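For illustration, here is roughly what the option does, sketched directly with Go's `regexp` package rather than through **tf**; the record and separator are the ones used by this commit's `TestFieldSeparator`:
```
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// tf -p 'tt*' -f 2,4 would treat runs of "t" as the field separator.
	re := regexp.MustCompile("tt*")
	fields := re.Split("atbttctttttdtttte", -1)
	// fields is now ["a" "b" "c" "d" "e"]; fields 2 and 4, joined by a
	// single space, make the key "b d".
	fmt.Println(strings.Join([]string{fields[1], fields[3]}, " "))
}
```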
`-g regexp`, `--grep regexp`
The initial **g** suggests `grep`.
@@ -101,6 +110,8 @@ Describes the function and options of **tf**.
Records are separated by newlines, fields within records by white space, defined as one or more space or tab characters.
The field separator can be overridden with the --fieldseparator option.
## Performance issues
Since the effect of topfew can be exactly duplicated with a combination of `awk`, `grep`, `sed` and `sort`, you wouldn’t be using it if you didn’t care about performance.
26 changes: 20 additions & 6 deletions internal/config.go
@@ -4,17 +4,19 @@ import (
"errors"
"fmt"
"os"
"regexp"
"strconv"
"strings"
)

type config struct {
size           int
fields         []uint
fieldSeparator *regexp.Regexp
Fname          string
filter         filters
width          int
sample         bool
}

func Configure(args []string) (*config, error) {
@@ -43,6 +45,13 @@ func Configure(args []string) (*config, error) {
i++
config.fields, err = parseFields(args[i])
}
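// -p/--fieldseparator takes a regular expression; it is compiled right here,
// so an invalid pattern is reported as a configuration error at startup.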
case arg == "-p" || arg == "--fieldseparator":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --fieldseparator")
} else {
i++
config.fieldSeparator, err = regexp.Compile(args[i])
}
case arg == "-g" || arg == "--grep":
if (i + 1) >= len(args) {
err = errors.New("insufficient arguments for --grep")
@@ -123,6 +132,7 @@ order of occurrences.
Usage: tf
-n, --number (output line count) [default is 10]
-f, --fields (field list) [default is the whole record]
-p, --fieldseparator (field separator regex) [default is white space]
-g, --grep (regexp) [may repeat, default is accept all]
-v, --vgrep (regexp) [may repeat, default is reject none]
-s, --sed (regexp) (replacement) [may repeat, default is no changes]
@@ -137,6 +147,10 @@ from the standard input and list the 10 which occur most often.
Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields
must be provided in order, so 3,1,7 is an error.
Fields are separated by white space (spaces or tabs) by default.
This can be overridden with the --fieldseparator option, at some cost in
performance.
The regexp-valued fields work as follows:
-g/--grep discards records that don't match the regexp (g for grep)
-v/--vgrep discards records that do match the regexp (v for grep -v)
2 changes: 2 additions & 0 deletions internal/config_test.go
@@ -14,6 +14,7 @@ func TestArgSyntax(t *testing.T) {
{"--sample", "--trace"}, {"--trace"},
{"--sed"}, {"-s", "x"}, {"--sample", "--sed", "1"},
{"--width", "a"}, {"-w", "0"}, {"--sample", "-w"},
{"--sample", "-p"}, {"--fieldseparator", "a["},
}

// not testing -h/--help because it'd be extra work to avoid printing out the usage
@@ -26,6 +27,7 @@ func TestArgSyntax(t *testing.T) {
{"--sample"},
{"--width", "2"}, {"-w", "3"},
{"--sample", "fname"},
{"-p", "a[bc]*d$"},
}

for _, bad := range bads {
77 changes: 47 additions & 30 deletions internal/keyfinder.go
@@ -8,6 +8,7 @@ package topfew

import (
"errors"
"regexp"
)

// NER is the error message returned when the input has fewer fields than the keyFinder is configured for.
@@ -16,73 +17,89 @@ const NER = "not enough bytes in record"
// keyFinder extracts a Key based on the specified fields from a record. fields is a slice of small integers
// representing field numbers; 1-based on the command line, 0-based here.
type keyFinder struct {
fields    []uint
key       []byte
separator *regexp.Regexp
}

// newKeyFinder creates a new Key finder with the supplied field numbers, the input should be 1 based.
// keyFinder is not thread-safe, you should clone it for each goroutine that uses it.
func newKeyFinder(keys []uint, separator *regexp.Regexp) *keyFinder {
kf := keyFinder{
key: make([]byte, 0, 128),
}
for _, knum := range keys {
kf.fields = append(kf.fields, knum-1)
}
kf.separator = separator
return &kf
}

// clone returns a new keyFinder with the same configuration. Each goroutine should use its own
// keyFinder instance.
func (kf *keyFinder) clone() *keyFinder {
return &keyFinder{
fields:    kf.fields,
key:       make([]byte, 0, 128),
separator: kf.separator,
}
}

// getKey extracts a key from the supplied record. This is applied to every record,
// so efficiency matters.
func (kf *keyFinder) getKey(record []byte) ([]byte, error) {
// if there are no Key-finders just return the record, minus any trailing newlines
if len(kf.fields) == 0 && kf.separator == nil {
if record[len(record)-1] == '\n' {
record = record[0 : len(record)-1]
}
return record, nil
}

var err error
if kf.separator == nil {
kf.key = kf.key[:0]
field := 0
index := 0
first := true

// for each field in the Key
for _, keyField := range kf.fields {
// bypass fields before the one we want
for field < int(keyField) {
index, err = pass(record, index)
if err != nil {
return nil, err
}
field++
}

// join(' ', kf)
if first {
first = false
} else {
kf.key = append(kf.key, ' ')
}

// attach desired field to Key
kf.key, index, err = gather(kf.key, record, index)
if err != nil {
return nil, err
}

field++
}
} else {
kf.key = kf.key[:0]
allFields := kf.separator.Split(string(record), -1)
for i, field := range kf.fields {
if int(field) >= len(allFields) {
return nil, errors.New(NER)
}
if i > 0 {
kf.key = append(kf.key, ' ')
}
kf.key = append(kf.key, []byte(allFields[field])...)
}
}
return kf.key, err
}
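The regexp path above converts each record to a string and calls `Split`, which is where the performance cost mentioned in the README comes from. A hypothetical micro-benchmark along these lines (not part of this commit; it assumes it lives in the same internal package as the tests below, with `regexp` and `testing` imported) could compare the two paths:
```
// BenchmarkKeyFinderSeparator is a sketch, not part of this commit: it compares
// the default whitespace key path with the regexp-separator path on one record.
func BenchmarkKeyFinderSeparator(b *testing.B) {
	record := []byte("alpha beta gamma delta epsilon\n")
	byWhitespace := newKeyFinder([]uint{2, 4}, nil)
	byRegexp := newKeyFinder([]uint{2, 4}, regexp.MustCompile(`[ \t]+`))

	b.Run("whitespace", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			if _, err := byWhitespace.getKey(record); err != nil {
				b.Fatal(err)
			}
		}
	})
	b.Run("regexp", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			if _, err := byRegexp.getKey(record); err != nil {
				b.Fatal(err)
			}
		}
	})
}
```
Both sub-benchmarks produce the same key ("beta delta"); the regexp variant just pays for the string conversion and the per-record `Split` allocation.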
71 changes: 66 additions & 5 deletions internal/keyfinder_test.go
@@ -2,9 +2,70 @@ package topfew

import (
"bytes"
"os"
"testing"
)

func TestFieldSeparator(t *testing.T) {
args := []string{"-p", "tt*", "-f", "2,4"}

c, err := Configure(args)
if err != nil {
t.Error("Config!")
}

records := []string{
"atbttctttttdtttte",
}
wanted := []string{
"b d",
}
kf := newKeyFinder(c.fields, c.fieldSeparator)
for i, record := range records {
got, err := kf.getKey([]byte(record))
if err != nil {
t.Error("getKey: " + err.Error())
}
if string(got) != wanted[i] {
t.Errorf("wanted %s got %s", wanted[i], string(got))
}
}
_, err = kf.getKey([]byte("atbtc"))
if err == nil || err.Error() != NER {
t.Error("bad error value")
}
}

func TestCSVSeparator(t *testing.T) {
args := []string{"-p", ",", "-f", "11"}
c, err := Configure(args)
if err != nil {
t.Error("Config!")
}
input, err := os.Open("../test/data/csoc.csv")
if err != nil {
t.Error("Open: " + err.Error())
}
counts, err := Run(c, input)
if err != nil {
t.Error("Run: " + err.Error())
}
if len(counts) != 5 {
t.Errorf("Got %d results, wanted 5", len(counts))
}
wantCounts := []uint64{4, 2, 1, 1, 1}
wantKeys := []string{"50", "-1.97", "amount", "-1.75", "-1.9"}
for i, count := range counts {
if *count.Count != wantCounts[i] {
t.Errorf("Counts[%d] is %d wanted %d", i, *count.Count, wantCounts[i])
}
// because for equal values, the sort isn't stable - Counts[2,3,4] are all 1
if i < 2 && count.Key != wantKeys[i] {
t.Errorf("Keys[%d] is %s wanted %s", i, count.Key, wantKeys[i])
}
}
}

func TestKeyFinder(t *testing.T) {
var records = []string{
"a x c",
@@ -13,8 +74,8 @@
}
var kf, kf2 *keyFinder

kf = newKeyFinder(nil, nil)
kf2 = newKeyFinder([]uint{}, nil)

for _, recordString := range records {
record := []byte(recordString)
@@ -29,7 +90,7 @@
}

singles := []string{"x", "b", "b"}
kf = newKeyFinder([]uint{2}, nil)
for i, record := range records {
k, err := kf.getKey([]byte(record))
if err != nil {
@@ -41,7 +102,7 @@
}
}

kf = newKeyFinder([]uint{1, 3}, nil)
for _, recordstring := range records {
record := []byte(recordstring)
r, err := kf.getKey(record)
@@ -50,7 +111,7 @@
}
}

kf = newKeyFinder([]uint{1, 4}, nil)
tooShorts := []string{"a", "a b", "a b c"}
for _, tooShortString := range tooShorts {
tooShort := []byte(tooShortString)
2 changes: 1 addition & 1 deletion internal/runner.go
@@ -8,7 +8,7 @@ import (

func Run(config *config, instream io.Reader) ([]*keyCount, error) {
// lifted out of main.go to facilitate testing
var kf = newKeyFinder(config.fields, config.fieldSeparator)
var topList []*keyCount
var err error

6 changes: 3 additions & 3 deletions internal/segmenter_test.go
@@ -21,7 +21,7 @@ func TestReadAll(t *testing.T) {
t.Error("OUCH")
}
s := segment{4176, 4951, file}
kf := newKeyFinder([]uint{7}, nil)
ch := make(chan segmentResult)
f := filters{nil, nil, nil}
go readSegment(&s, &f, kf, ch)
@@ -77,7 +77,7 @@ func TestReadSegmentFiltering(t *testing.T) {
_, _ = fmt.Fprint(tmpfile, input)
_ = tmpfile.Close()
counter := newCounter(10)
err = readFileInSegments(tmpName, &c.filter, counter, newKeyFinder(c.fields, nil), 1)
if err != nil {
t.Error("Run? " + err.Error())
}
@@ -110,7 +110,7 @@ func TestVeryLongLines(t *testing.T) {
}
_ = tmpfile.Close()
counter := newCounter(10)
err = readFileInSegments(tmpName, &filters{}, counter, newKeyFinder(nil, nil), 1)
if err != nil {
t.Fatal("Failed to read long-lines file")
}
2 changes: 1 addition & 1 deletion internal/stream_test.go
@@ -48,7 +48,7 @@ func Test1KLinesStream(t *testing.T) {
//noinspection ALL
defer file.Close()

kf := newKeyFinder([]uint{1}, nil)
f := filters{nil, nil, nil}
x, err := fromStream(file, &f, kf, 5)
if err != nil {