From feaf0e2fb6f70fb11f2c8e5532e5156afa0ff3c5 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Sat, 17 Feb 2024 12:15:09 -0500
Subject: [PATCH] mlr sparsify

Add the sparsify verb, which removes fields whose value is the empty
string (or another filler given with -s), optionally restricted to the
field names given with -f.
---
 docs/src/reference-verbs.md.in            |   6 +
 pkg/transformers/aaa_transformer_table.go |   1 +
 pkg/transformers/sparsify.go              | 205 ++++++++++++++++++++++
 test/cases/cli-help/0001/expout           |  14 ++
 test/input/sparsify-input.csv             |   5 +
 5 files changed, 231 insertions(+)
 create mode 100644 pkg/transformers/sparsify.go
 create mode 100644 test/input/sparsify-input.csv

diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in
index 44feda3deb..8959ebf6bb 100644
--- a/docs/src/reference-verbs.md.in
+++ b/docs/src/reference-verbs.md.in
@@ -995,6 +995,12 @@ GENMD-RUN-COMMAND
 mlr --ijson --opprint sort-within-records data/sort-within-records.json
 GENMD-EOF
 
+## sparsify
+
+GENMD-RUN-COMMAND
+mlr sparsify --help
+GENMD-EOF
+
 ## split
 
 GENMD-RUN-COMMAND
diff --git a/pkg/transformers/aaa_transformer_table.go b/pkg/transformers/aaa_transformer_table.go
index ed98af07f2..34a5b6ea85 100644
--- a/pkg/transformers/aaa_transformer_table.go
+++ b/pkg/transformers/aaa_transformer_table.go
@@ -62,6 +62,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{
 	SkipTrivialRecordsSetup,
 	SortSetup,
 	SortWithinRecordsSetup,
+	SparsifySetup,
 	SplitSetup,
 	SsubSetup,
 	Stats1Setup,
diff --git a/pkg/transformers/sparsify.go b/pkg/transformers/sparsify.go
new file mode 100644
index 0000000000..be1a6de73a
--- /dev/null
+++ b/pkg/transformers/sparsify.go
@@ -0,0 +1,205 @@
+package transformers
+
+import (
+	"container/list"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/johnkerl/miller/pkg/cli"
+	"github.com/johnkerl/miller/pkg/lib"
+	"github.com/johnkerl/miller/pkg/mlrval"
+	"github.com/johnkerl/miller/pkg/types"
+)
+
+// ----------------------------------------------------------------
+const verbNameSparsify = "sparsify"
+
+// SparsifySetup is the transformer-table entry for the sparsify verb.
+var SparsifySetup = TransformerSetup{
+	Verb:         verbNameSparsify,
+	UsageFunc:    transformerSparsifyUsage,
+	ParseCLIFunc: transformerSparsifyParseCLI,
+	IgnoresInput: false,
+}
+
+// transformerSparsifyUsage prints the verb's help text to o.
+func transformerSparsifyUsage(
+	o *os.File,
+) {
+	fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSparsify)
+	fmt.Fprint(o,
+		`Unsets fields for which the value is the empty string (or, optionally, another
+specified value). Only makes sense with output format not being CSV or TSV.
+`)
+
+	fmt.Fprintf(o, "Options:\n")
+	fmt.Fprintf(o, "-s {filler string} What values to remove. Defaults to the empty string.\n")
+	fmt.Fprintf(o, "-f {a,b,c} Specify field names to be operated on; any other fields won't be\n")
+	fmt.Fprintf(o, " modified. The default is to modify all fields.\n")
+	fmt.Fprintf(o, "-h|--help Show this message.\n")
+
+	fmt.Fprint(o,
+		`Example: if the input is the record 'a=1,b=,c=3', then the output is the
+record 'a=1,c=3'. With '-f b' the output is the same; with '-f a' the record
+is passed through unchanged, since field b is not operated on.
+`)
+}
+
+// transformerSparsifyParseCLI parses the verb's flags starting at args[*pargi]
+// and advances *pargi past them. It returns nil unless doConstruct is true.
+func transformerSparsifyParseCLI(
+	pargi *int,
+	argc int,
+	args []string,
+	_ *cli.TOptions,
+	doConstruct bool, // false for first pass of CLI-parse, true for second pass
+) IRecordTransformer {
+
+	// Skip the verb name from the current spot in the mlr command line
+	argi := *pargi
+	verb := args[argi]
+	argi++
+
+	fillerString := ""
+	var specifiedFieldNames []string = nil
+
+	for argi < argc /* variable increment: 1 or 2 depending on flag */ {
+		opt := args[argi]
+		if !strings.HasPrefix(opt, "-") {
+			break // No more flag options to process
+		}
+		if args[argi] == "--" {
+			break // All transformers must do this so main-flags can follow verb-flags
+		}
+		argi++
+
+		if opt == "-h" || opt == "--help" {
+			transformerSparsifyUsage(os.Stdout)
+			os.Exit(0)
+
+		} else if opt == "-s" {
+			fillerString = cli.VerbGetStringArgOrDie(verb, opt, args, &argi, argc)
+
+		} else if opt == "-f" {
+			specifiedFieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc)
+
+		} else {
+			transformerSparsifyUsage(os.Stderr)
+			os.Exit(1)
+		}
+	}
+
+	*pargi = argi
+	if !doConstruct { // All transformers must do this for main command-line parsing
+		return nil
+	}
+
+	transformer, err := NewTransformerSparsify(
+		fillerString,
+		specifiedFieldNames,
+	)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+
+	return transformer
+}
+
+// ----------------------------------------------------------------
+// TransformerSparsify drops fields whose value equals fillerString, either for
+// all fields or only for those named in fieldNamesSet.
+type TransformerSparsify struct {
+	fillerString          string
+	fieldNamesSet         map[string]bool
+	recordTransformerFunc RecordTransformerFunc
+}
+
+// NewTransformerSparsify builds the transformer. A nil specifiedFieldNames
+// means all fields are candidates for removal.
+func NewTransformerSparsify(
+	fillerString string,
+	specifiedFieldNames []string,
+) (*TransformerSparsify, error) {
+
+	tr := &TransformerSparsify{
+		fillerString:  fillerString,
+		fieldNamesSet: lib.StringListToSet(specifiedFieldNames),
+	}
+	if specifiedFieldNames == nil {
+		tr.recordTransformerFunc = tr.transformAll
+	} else {
+		tr.recordTransformerFunc = tr.transformSome
+	}
+
+	return tr, nil
+}
+
+// Transform processes one record, or passes along the end-of-stream marker.
+func (tr *TransformerSparsify) Transform(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
+
+	if !inrecAndContext.EndOfStream {
+		tr.recordTransformerFunc(
+			inrecAndContext,
+			outputRecordsAndContexts,
+			inputDownstreamDoneChannel,
+			outputDownstreamDoneChannel,
+		)
+	} else {
+		outputRecordsAndContexts.PushBack(inrecAndContext) // end-of-stream marker
+	}
+}
+
+// transformAll keeps only those fields whose value differs from the filler.
+func (tr *TransformerSparsify) transformAll(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	inrec := inrecAndContext.Record
+	outrec := mlrval.NewMlrmapAsRecord()
+
+	for pe := inrec.Head; pe != nil; pe = pe.Next {
+		if pe.Value.String() != tr.fillerString {
+			// Reference OK because ownership transfer
+			outrec.PutReference(pe.Key, pe.Value)
+		}
+	}
+
+	outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context)
+	outputRecordsAndContexts.PushBack(outrecAndContext)
+}
+
+// ----------------------------------------------------------------
+// transformSome is like transformAll but leaves unspecified fields untouched.
+func (tr *TransformerSparsify) transformSome(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List, // list of *types.RecordAndContext
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	inrec := inrecAndContext.Record
+	outrec := mlrval.NewMlrmapAsRecord()
+
+	for pe := inrec.Head; pe != nil; pe = pe.Next {
+		if tr.fieldNamesSet[pe.Key] {
+			if pe.Value.String() != tr.fillerString {
+				// Reference OK because ownership transfer
+				outrec.PutReference(pe.Key, pe.Value)
+			}
+		} else {
+			outrec.PutReference(pe.Key, pe.Value)
+		}
+	}
+
+	outrecAndContext := types.NewRecordAndContext(outrec, &inrecAndContext.Context)
+	outputRecordsAndContexts.PushBack(outrecAndContext)
+}
diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout
index b25e4a56d1..07ca9d0961 100644
--- a/test/cases/cli-help/0001/expout
+++ b/test/cases/cli-help/0001/expout
@@ -988,6 +988,20 @@ Options:
 -r Recursively sort subobjects/submaps, e.g. for JSON input.
 -h|--help Show this message.
 
+================================================================
+sparsify
+Usage: mlr sparsify [options]
+Unsets fields for which the value is the empty string (or, optionally, another
+specified value). Only makes sense with output format not being CSV or TSV.
+Options:
+-s {filler string} What values to remove. Defaults to the empty string.
+-f {a,b,c} Specify field names to be operated on; any other fields won't be
+ modified. The default is to modify all fields.
+-h|--help Show this message.
+Example: if the input is the record 'a=1,b=,c=3', then the output is the
+record 'a=1,c=3'. With '-f b' the output is the same; with '-f a' the record
+is passed through unchanged, since field b is not operated on.
+
 ================================================================
 split
 Usage: mlr split [options] {filename}
diff --git a/test/input/sparsify-input.csv b/test/input/sparsify-input.csv
new file mode 100644
index 0000000000..16916596e0
--- /dev/null
+++ b/test/input/sparsify-input.csv
@@ -0,0 +1,5 @@
+a,b,c
+1,2,3
+4,5,
+,,
+7,8,9