From e2338195bafaf46d58ddf956c6af88e4fd53c32e Mon Sep 17 00:00:00 2001
From: "Mr. Lance E Sloan" <17595351+sloanlance@users.noreply.github.com>
Date: Wed, 23 Aug 2023 16:08:48 -0400
Subject: [PATCH] filename options for `split` (iss. #1365) (#1366)

* #1365 - filename options for `split`

* Don't use joiner string when prefix is empty.
* Add option to specify joiner string.
* Add option to not URL-escape file names.

* #1365 - update documentation

* #1365 - don't URL-escape file name prefix

I **_thought_** it'd be cool to apply URL-escaping to the file name prefix as well, just in case it included spaces or other characters.  I forgot that a common use for the prefix is to specify a directory path that will contain the file.  When the slashes ("`/`") of the path are URL-escaped, they become "`%2F`" and the directories will not be created.  So, I moved the prefix handling code to come after the URL-escaping.

* #1365 - new `split` options for CLI help output

* #1365 - fix escape/suffix logic error

Trying to make the `return` statement cleaner, I thought it'd be good to add the file name suffix immediately after the file name is URL-escaped.  I'd forgotten that, with the suffix added inside the URL-escaping step, it would not be added at all when the new `-e` option is used to skip URL-escaping.  So, I put the suffix back where I had it.

* #1365 - add `split` to the "10 minutes" document

Not strictly part of this issue, but as I was checking for docs that I should update as a result of my changes, I noticed this document showed how to split data using the `put` and `tee` combination, but did not mention the `split` verb.

* #1365 - updated manpage

When I ran `make dev`, generating `data-diving-examples.md` failed.  The two `manpage.txt` files ended up empty, but `mlr.1` seems to be correct.

---------

Co-authored-by: Mr. Lance E Sloan (sloanlance) <sloanlance@users.noreply.github.com>
---
 docs/src/10min.md                  | 37 +++++++++++++++
 docs/src/10min.md.in               | 18 ++++++++
 docs/src/manpage.txt               |  2 +
 internal/pkg/transformers/split.go | 74 ++++++++++++++++++++----------
 man/mlr.1                          |  6 ++-
 test/cases/cli-help/0001/expout    |  2 +
 6 files changed, 113 insertions(+), 26 deletions(-)
diff --git a/docs/src/10min.md b/docs/src/10min.md
index 33f7252dab..d9e4d24163 100644
--- a/docs/src/10min.md
+++ b/docs/src/10min.md
@@ -909,3 +909,40 @@ yellow,triangle,true,1,11,43.6498,9.8870
 purple,triangle,false,5,51,81.2290,8.5910
 purple,triangle,false,7,65,80.1405,5.8240
 </pre>
+
+Alternatively, the `split` verb can do the same thing:
+
+<pre class="pre-highlight-non-pair">
+<b>mlr --csv --from example.csv split -g shape</b>
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>cat split_circle.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>cat split_square.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+color,shape,flag,k,index,quantity,rate
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+purple,square,false,10,91,72.3735,8.2430
+</pre>
+
+<pre class="pre-highlight-in-pair">
+<b>cat split_triangle.csv</b>
+</pre>
+<pre class="pre-non-highlight-in-pair">
+color,shape,flag,k,index,quantity,rate
+yellow,triangle,true,1,11,43.6498,9.8870
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+</pre>
diff --git a/docs/src/10min.md.in b/docs/src/10min.md.in
index 7a0696c87e..0fdc94bf1f 100644
--- a/docs/src/10min.md.in
+++ b/docs/src/10min.md.in
@@ -434,3 +434,21 @@ GENMD-EOF
 GENMD-RUN-COMMAND
 cat triangle.csv
 GENMD-EOF
+
+Alternatively, the `split` verb can do the same thing:
+
+GENMD-RUN-COMMAND
+mlr --csv --from example.csv split -g shape
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+cat split_circle.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+cat split_square.csv
+GENMD-EOF
+
+GENMD-RUN-COMMAND
+cat split_triangle.csv
+GENMD-EOF
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
index 0c04fc330e..48497719b5 100644
--- a/docs/src/manpage.txt
+++ b/docs/src/manpage.txt
@@ -1813,6 +1813,8 @@ MILLER(1)                                                            MILLER(1)
        --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
        -a           Append to existing file(s), if any, rather than overwriting.
        -v           Send records along to downstream verbs as well as splitting to files.
+       -e           Do NOT URL-escape names of output files.
+       -j {j}       Use string J to join filename parts; default "_".
        -h|--help    Show this message.
        Any of the output-format command-line flags (see mlr -h). For example, using
          mlr --icsv --from myfile.csv split --ojson -n 1000
diff --git a/internal/pkg/transformers/split.go b/internal/pkg/transformers/split.go
index 579aa09708..6e702d4fd4 100644
--- a/internal/pkg/transformers/split.go
+++ b/internal/pkg/transformers/split.go
@@ -1,7 +1,6 @@
 package transformers
 
 import (
-	"bytes"
 	"container/list"
 	"fmt"
 	"net/url"
@@ -17,6 +16,7 @@ import (
 // ----------------------------------------------------------------
 const verbNameSplit = "split"
 const splitDefaultOutputFileNamePrefix = "split"
+const splitDefaultFileNamePartJoiner = "_"
 
 var SplitSetup = TransformerSetup{
 	Verb:         verbNameSplit,
@@ -39,6 +39,8 @@ Exactly one  of -m, -n, or -g must be supplied.
 --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
 -a           Append to existing file(s), if any, rather than overwriting.
 -v           Send records along to downstream verbs as well as splitting to files.
+-e           Do NOT URL-escape names of output files.
+-j {j}       Use string J to join filename parts; default "`+splitDefaultFileNamePartJoiner+`".
 -h|--help    Show this message.
 Any of the output-format command-line flags (see mlr -h). For example, using
   mlr --icsv --from myfile.csv split --ojson -n 1000
@@ -88,6 +90,8 @@ func transformerSplitParseCLI(
 	var doSize bool = false
 	var groupByFieldNames []string = nil
 	var emitDownstream bool = false
+	var escapeFileNameCharacters bool = true
+	var fileNamePartJoiner string = splitDefaultFileNamePartJoiner
 	var doAppend bool = false
 	var outputFileNamePrefix string = splitDefaultOutputFileNamePrefix
 	var outputFileNameSuffix string = "uninit"
@@ -138,6 +142,12 @@ func transformerSplitParseCLI(
 		} else if opt == "-v" {
 			emitDownstream = true
 
+		} else if opt == "-e" {
+			escapeFileNameCharacters = false
+
+		} else if opt == "-j" {
+			fileNamePartJoiner = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+
 		} else {
 			// This is inelegant. For error-proofing we advance argi already in our
 			// loop (so individual if-statements don't need to). However,
@@ -180,6 +190,8 @@ func transformerSplitParseCLI(
 		doSize,
 		groupByFieldNames,
 		emitDownstream,
+		escapeFileNameCharacters,
+		fileNamePartJoiner,
 		doAppend,
 		outputFileNamePrefix,
 		outputFileNameSuffix,
@@ -195,14 +207,16 @@ func transformerSplitParseCLI(
 
 // ----------------------------------------------------------------
 type TransformerSplit struct {
-	n                    int64
-	outputFileNamePrefix string
-	outputFileNameSuffix string
-	emitDownstream       bool
-	ungroupedCounter     int64
-	groupByFieldNames    []string
-	recordWriterOptions  *cli.TWriterOptions
-	doAppend             bool
+	n                        int64
+	outputFileNamePrefix     string
+	outputFileNameSuffix     string
+	emitDownstream           bool
+	escapeFileNameCharacters bool
+	fileNamePartJoiner       string
+	ungroupedCounter         int64
+	groupByFieldNames        []string
+	recordWriterOptions      *cli.TWriterOptions
+	doAppend                 bool
 
 	// For doSize ungrouped: only one file open at a time
 	outputHandler    output.OutputHandler
@@ -220,6 +234,8 @@ func NewTransformerSplit(
 	doSize bool,
 	groupByFieldNames []string,
 	emitDownstream bool,
+	escapeFileNameCharacters bool,
+	fileNamePartJoiner string,
 	doAppend bool,
 	outputFileNamePrefix string,
 	outputFileNameSuffix string,
@@ -227,14 +243,16 @@ func NewTransformerSplit(
 ) (*TransformerSplit, error) {
 
 	tr := &TransformerSplit{
-		n:                    n,
-		outputFileNamePrefix: outputFileNamePrefix,
-		outputFileNameSuffix: outputFileNameSuffix,
-		emitDownstream:       emitDownstream,
-		ungroupedCounter:     0,
-		groupByFieldNames:    groupByFieldNames,
-		recordWriterOptions:  recordWriterOptions,
-		doAppend:             doAppend,
+		n:                        n,
+		outputFileNamePrefix:     outputFileNamePrefix,
+		outputFileNameSuffix:     outputFileNameSuffix,
+		emitDownstream:           emitDownstream,
+		escapeFileNameCharacters: escapeFileNameCharacters,
+		fileNamePartJoiner:       fileNamePartJoiner,
+		ungroupedCounter:         0,
+		groupByFieldNames:        groupByFieldNames,
+		recordWriterOptions:      recordWriterOptions,
+		doAppend:                 doAppend,
 
 		outputHandler:    nil,
 		previousQuotient: -1,
@@ -402,13 +420,21 @@ func (tr *TransformerSplit) makeUngroupedOutputFileName(k int64) string {
 func (tr *TransformerSplit) makeGroupedOutputFileName(
 	groupByFieldValues []*mlrval.Mlrval,
 ) string {
-	var buffer bytes.Buffer
-	buffer.WriteString(tr.outputFileNamePrefix)
+	var fileNameParts []string
+
 	for _, groupByFieldValue := range groupByFieldValues {
-		buffer.WriteString("_")
-		buffer.WriteString(url.QueryEscape(groupByFieldValue.String()))
+		fileNameParts = append(fileNameParts, groupByFieldValue.String())
 	}
-	buffer.WriteString(".")
-	buffer.WriteString(tr.outputFileNameSuffix)
-	return buffer.String()
+
+	fileName := strings.Join(fileNameParts, tr.fileNamePartJoiner)
+
+	if tr.escapeFileNameCharacters {
+		fileName = url.QueryEscape(fileName)
+	}
+
+	if tr.outputFileNamePrefix != "" {
+		fileName = tr.outputFileNamePrefix + tr.fileNamePartJoiner + fileName
+	}
+
+	return fileName + "." + tr.outputFileNameSuffix
 }
diff --git a/man/mlr.1 b/man/mlr.1
index ab56c69bb3..c14251a75e 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -2,12 +2,12 @@
 .\"     Title: mlr
 .\"    Author: [see the "AUTHOR" section]
 .\" Generator: ./mkman.rb
-.\"      Date: 2023-08-20
+.\"      Date: 2023-08-22
 .\"    Manual: \ \&
 .\"    Source: \ \&
 .\"  Language: English
 .\"
-.TH "MILLER" "1" "2023-08-20" "\ \&" "\ \&"
+.TH "MILLER" "1" "2023-08-22" "\ \&" "\ \&"
 .\" -----------------------------------------------------------------
 .\" * Portability definitions
 .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2296,6 +2296,8 @@ Exactly one  of -m, -n, or -g must be supplied.
 --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
 -a           Append to existing file(s), if any, rather than overwriting.
 -v           Send records along to downstream verbs as well as splitting to files.
+-e           Do NOT URL-escape names of output files.
+-j {j}       Use string J to join filename parts; default "_".
 -h|--help    Show this message.
 Any of the output-format command-line flags (see mlr -h). For example, using
   mlr --icsv --from myfile.csv split --ojson -n 1000
diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout
index 55efea8ac7..14772570b6 100644
--- a/test/cases/cli-help/0001/expout
+++ b/test/cases/cli-help/0001/expout
@@ -997,6 +997,8 @@ Exactly one  of -m, -n, or -g must be supplied.
 --suffix {s} Specify filename suffix; default is from mlr output format, e.g. "csv".
 -a           Append to existing file(s), if any, rather than overwriting.
 -v           Send records along to downstream verbs as well as splitting to files.
+-e           Do NOT URL-escape names of output files.
+-j {j}       Use string J to join filename parts; default "_".
 -h|--help    Show this message.
 Any of the output-format command-line flags (see mlr -h). For example, using
   mlr --icsv --from myfile.csv split --ojson -n 1000