From e2338195bafaf46d58ddf956c6af88e4fd53c32e Mon Sep 17 00:00:00 2001 From: "Mr. Lance E Sloan" <17595351+sloanlance@users.noreply.github.com> Date: Wed, 23 Aug 2023 16:08:48 -0400 Subject: [PATCH] filename options for `split` (iss. #1365) (#1366) * #1365 - filename options for `split` * Don't use joiner string when prefix is empty. * Add option to specify joiner string. * Add option to not URL-escape file names. * #1365 - update documentation * #1365 - don't URL-escape file name prefix I **_thought_** it'd be cool to apply URL-escaping to the file name prefix as well, just in case it included spaces or other characters. I forgot that a common use for the prefix is to specify a directory path that will contain the file. When the slashes ("`/`") of the path are URL-escaped, they become "`%2F`" and the directories will not be created. So, I moved the prefix handling code to come after the URL-escaping. * #1365 - new `split` options for CLI help output * #1365 - fix escape/suffix logic error Trying to make the `return` statement cleaner, I thought it'd be good to add the file name suffix immediately after the file name is URL-escaped. I'd forgotten that the suffix would not be added if the new `-e` option is used to skip URL-escaping. So, I put the suffix back where I had it. * #1365 - add `split` to the "10 minutes" document Not strictly part of this issue, but as I was checking for docs that I should update as a result of my changes, I noticed this document showed how to split data using the `put` and `tee` combination, but did not mention the `split` verb. * #1365 - updated manpage When I ran `make dev`, generating `data-diving-examples.md` failed. The two `manpage.txt` files ended up empty, but `mlr.1` seems to be correct. --------- Co-authored-by: Mr. 
Lance E Sloan (sloanlance) --- docs/src/10min.md | 37 +++++++++++++++ docs/src/10min.md.in | 18 ++++++++ docs/src/manpage.txt | 2 + internal/pkg/transformers/split.go | 74 ++++++++++++++++++++---------- man/mlr.1 | 6 ++- test/cases/cli-help/0001/expout | 2 + 6 files changed, 113 insertions(+), 26 deletions(-) diff --git a/docs/src/10min.md b/docs/src/10min.md index 33f7252dab..d9e4d24163 100644 --- a/docs/src/10min.md +++ b/docs/src/10min.md @@ -909,3 +909,40 @@ yellow,triangle,true,1,11,43.6498,9.8870 purple,triangle,false,5,51,81.2290,8.5910 purple,triangle,false,7,65,80.1405,5.8240 + +Alternatively, the `split` verb can do the same thing: + +
+mlr --csv --from example.csv split -g shape
+
+ +
+cat split_circle.csv
+
+
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
+
+ +
+cat split_square.csv
+
+
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+purple,square,false,10,91,72.3735,8.2430
+
+ +
+cat split_triangle.csv
+
+
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+
diff --git a/docs/src/10min.md.in b/docs/src/10min.md.in index 7a0696c87e..0fdc94bf1f 100644 --- a/docs/src/10min.md.in +++ b/docs/src/10min.md.in @@ -434,3 +434,21 @@ GENMD-EOF GENMD-RUN-COMMAND cat triangle.csv GENMD-EOF + +Alternatively, the `split` verb can do the same thing: + +GENMD-RUN-COMMAND +mlr --csv --from example.csv split -g shape +GENMD-EOF + +GENMD-RUN-COMMAND +cat split_circle.csv +GENMD-EOF + +GENMD-RUN-COMMAND +cat split_square.csv +GENMD-EOF + +GENMD-RUN-COMMAND +cat split_triangle.csv +GENMD-EOF diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 0c04fc330e..48497719b5 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -1813,6 +1813,8 @@ MILLER(1) MILLER(1) --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". -a Append to existing file(s), if any, rather than overwriting. -v Send records along to downstream verbs as well as splitting to files. + -e Do NOT URL-escape names of output files. + -j {j} Use string J to join filename parts; default "_". -h|--help Show this message. Any of the output-format command-line flags (see mlr -h). For example, using mlr --icsv --from myfile.csv split --ojson -n 1000 diff --git a/internal/pkg/transformers/split.go b/internal/pkg/transformers/split.go index 579aa09708..6e702d4fd4 100644 --- a/internal/pkg/transformers/split.go +++ b/internal/pkg/transformers/split.go @@ -1,7 +1,6 @@ package transformers import ( - "bytes" "container/list" "fmt" "net/url" @@ -17,6 +16,7 @@ import ( // ---------------------------------------------------------------- const verbNameSplit = "split" const splitDefaultOutputFileNamePrefix = "split" +const splitDefaultFileNamePartJoiner = "_" var SplitSetup = TransformerSetup{ Verb: verbNameSplit, @@ -39,6 +39,8 @@ Exactly one of -m, -n, or -g must be supplied. --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". -a Append to existing file(s), if any, rather than overwriting. 
-v Send records along to downstream verbs as well as splitting to files. +-e Do NOT URL-escape names of output files. +-j {j} Use string J to join filename parts; default "`+splitDefaultFileNamePartJoiner+`". -h|--help Show this message. Any of the output-format command-line flags (see mlr -h). For example, using mlr --icsv --from myfile.csv split --ojson -n 1000 @@ -88,6 +90,8 @@ func transformerSplitParseCLI( var doSize bool = false var groupByFieldNames []string = nil var emitDownstream bool = false + var escapeFileNameCharacters bool = true + var fileNamePartJoiner string = splitDefaultFileNamePartJoiner var doAppend bool = false var outputFileNamePrefix string = splitDefaultOutputFileNamePrefix var outputFileNameSuffix string = "uninit" @@ -138,6 +142,12 @@ func transformerSplitParseCLI( } else if opt == "-v" { emitDownstream = true + } else if opt == "-e" { + escapeFileNameCharacters = false + + } else if opt == "-j" { + fileNamePartJoiner = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc) + } else { // This is inelegant. For error-proofing we advance argi already in our // loop (so individual if-statements don't need to). 
However, @@ -180,6 +190,8 @@ func transformerSplitParseCLI( doSize, groupByFieldNames, emitDownstream, + escapeFileNameCharacters, + fileNamePartJoiner, doAppend, outputFileNamePrefix, outputFileNameSuffix, @@ -195,14 +207,16 @@ func transformerSplitParseCLI( // ---------------------------------------------------------------- type TransformerSplit struct { - n int64 - outputFileNamePrefix string - outputFileNameSuffix string - emitDownstream bool - ungroupedCounter int64 - groupByFieldNames []string - recordWriterOptions *cli.TWriterOptions - doAppend bool + n int64 + outputFileNamePrefix string + outputFileNameSuffix string + emitDownstream bool + escapeFileNameCharacters bool + fileNamePartJoiner string + ungroupedCounter int64 + groupByFieldNames []string + recordWriterOptions *cli.TWriterOptions + doAppend bool // For doSize ungrouped: only one file open at a time outputHandler output.OutputHandler @@ -220,6 +234,8 @@ func NewTransformerSplit( doSize bool, groupByFieldNames []string, emitDownstream bool, + escapeFileNameCharacters bool, + fileNamePartJoiner string, doAppend bool, outputFileNamePrefix string, outputFileNameSuffix string, @@ -227,14 +243,16 @@ func NewTransformerSplit( ) (*TransformerSplit, error) { tr := &TransformerSplit{ - n: n, - outputFileNamePrefix: outputFileNamePrefix, - outputFileNameSuffix: outputFileNameSuffix, - emitDownstream: emitDownstream, - ungroupedCounter: 0, - groupByFieldNames: groupByFieldNames, - recordWriterOptions: recordWriterOptions, - doAppend: doAppend, + n: n, + outputFileNamePrefix: outputFileNamePrefix, + outputFileNameSuffix: outputFileNameSuffix, + emitDownstream: emitDownstream, + escapeFileNameCharacters: escapeFileNameCharacters, + fileNamePartJoiner: fileNamePartJoiner, + ungroupedCounter: 0, + groupByFieldNames: groupByFieldNames, + recordWriterOptions: recordWriterOptions, + doAppend: doAppend, outputHandler: nil, previousQuotient: -1, @@ -402,13 +420,21 @@ func (tr *TransformerSplit) 
makeUngroupedOutputFileName(k int64) string { func (tr *TransformerSplit) makeGroupedOutputFileName( groupByFieldValues []*mlrval.Mlrval, ) string { - var buffer bytes.Buffer - buffer.WriteString(tr.outputFileNamePrefix) + var fileNameParts []string + for _, groupByFieldValue := range groupByFieldValues { - buffer.WriteString("_") - buffer.WriteString(url.QueryEscape(groupByFieldValue.String())) + fileNameParts = append(fileNameParts, groupByFieldValue.String()) } - buffer.WriteString(".") - buffer.WriteString(tr.outputFileNameSuffix) - return buffer.String() + + fileName := strings.Join(fileNameParts, tr.fileNamePartJoiner) + + if tr.escapeFileNameCharacters { + fileName = url.QueryEscape(fileName) + } + + if tr.outputFileNamePrefix != "" { + fileName = tr.outputFileNamePrefix + tr.fileNamePartJoiner + fileName + } + + return fileName + "." + tr.outputFileNameSuffix } diff --git a/man/mlr.1 b/man/mlr.1 index ab56c69bb3..c14251a75e 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2023-08-20 +.\" Date: 2023-08-22 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2023-08-20" "\ \&" "\ \&" +.TH "MILLER" "1" "2023-08-22" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2296,6 +2296,8 @@ Exactly one of -m, -n, or -g must be supplied. --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". -a Append to existing file(s), if any, rather than overwriting. -v Send records along to downstream verbs as well as splitting to files. +-e Do NOT URL-escape names of output files. +-j {j} Use string J to join filename parts; default "_". -h|--help Show this message. Any of the output-format command-line flags (see mlr -h). 
For example, using mlr --icsv --from myfile.csv split --ojson -n 1000 diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index 55efea8ac7..14772570b6 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -997,6 +997,8 @@ Exactly one of -m, -n, or -g must be supplied. --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv". -a Append to existing file(s), if any, rather than overwriting. -v Send records along to downstream verbs as well as splitting to files. +-e Do NOT URL-escape names of output files. +-j {j} Use string J to join filename parts; default "_". -h|--help Show this message. Any of the output-format command-line flags (see mlr -h). For example, using mlr --icsv --from myfile.csv split --ojson -n 1000