From 183a42c249dd73651c1be8346dae087faaf271da Mon Sep 17 00:00:00 2001 From: Mike Farah Date: Sun, 31 Jul 2022 17:31:27 +1000 Subject: [PATCH] WIP: adding CSV decoder --- cmd/utils.go | 4 + pkg/yqlib/csv_test.go | 193 ++++++++++++++++++++++++++++++++ pkg/yqlib/decoder.go | 6 + pkg/yqlib/decoder_csv_object.go | 77 +++++++++++++ pkg/yqlib/doc/usage/csv-tsv.md | 143 +++++++++++++++++++++++ pkg/yqlib/operators_test.go | 2 +- 6 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 pkg/yqlib/csv_test.go create mode 100644 pkg/yqlib/decoder_csv_object.go create mode 100644 pkg/yqlib/doc/usage/csv-tsv.md diff --git a/cmd/utils.go b/cmd/utils.go index e7750e86..e0aaf1d6 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -68,6 +68,10 @@ func configureDecoder() (yqlib.Decoder, error) { return yqlib.NewPropertiesDecoder(), nil case yqlib.JsonInputFormat: return yqlib.NewJSONDecoder(), nil + case yqlib.CSVObjectInputFormat: + return yqlib.NewCSVObjectDecoder(','), nil + case yqlib.TSVObjectInputFormat: + return yqlib.NewCSVObjectDecoder('\t'), nil } return yqlib.NewYamlDecoder(), nil diff --git a/pkg/yqlib/csv_test.go b/pkg/yqlib/csv_test.go new file mode 100644 index 00000000..480ee1d3 --- /dev/null +++ b/pkg/yqlib/csv_test.go @@ -0,0 +1,193 @@ +package yqlib + +import ( + "bufio" + "fmt" + "testing" + + "github.com/mikefarah/yq/v4/test" +) + +const csvSimple = `name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 +` + +const csvSimpleShort = `Name,Number of Cats +Gary,1 +Samantha's Rabbit,2 +` + +const tsvSimple = `name numberOfCats likesApples height +Gary 1 true 168.8 +Samantha's Rabbit 2 false -188.8 +` + +const expectedYamlFromCSV = `- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 +` + +const csvTestSimpleYaml = `- [i, like, csv] +- [because, excel, is, cool]` + +const csvTestExpectedSimpleCsv = `i,like,csv 
+because,excel,is,cool +` + +const tsvTestExpectedSimpleCsv = `i like csv +because excel is cool +` + +var csvScenarios = []formatScenario{ + { + description: "Encode CSV simple", + input: csvTestSimpleYaml, + expected: csvTestExpectedSimpleCsv, + scenarioType: "encode-csv", + }, + { + description: "Encode TSV simple", + input: csvTestSimpleYaml, + expected: tsvTestExpectedSimpleCsv, + scenarioType: "encode-tsv", + }, + { + description: "Encode array of objects to csv", + subdescription: "Add the header row manually, then the we convert each object into an array of values - resulting in an array of arrays. Nice thing about this method is you can pick the columns and call the header whatever you like.", + input: expectedYamlFromCSV, + expected: csvSimpleShort, + expression: `[["Name", "Number of Cats"]] + [.[] | [.name, .numberOfCats ]]`, + scenarioType: "encode-csv", + }, + { + description: "Encode array of objects to csv - generic", + subdescription: "This is a little trickier than the previous example - we dynamically work out the $header, and use that to automatically create the value arrays.", + input: expectedYamlFromCSV, + expected: csvSimple, + expression: `(.[0] | keys | .[] ) as $header | [[$header]] + [.[] | [ .[$header] ]]`, + scenarioType: "encode-csv", + }, + { + description: "Parse CSV into an array of objects", + subdescription: "First row is assumed to define the fields", + input: csvSimple, + expected: expectedYamlFromCSV, + scenarioType: "decode-csv-object", + }, + { + description: "Parse TSV into an array of objects", + subdescription: "First row is assumed to define the fields", + input: tsvSimple, + expected: expectedYamlFromCSV, + scenarioType: "decode-tsv-object", + }, +} + +func testCSVScenario(t *testing.T, s formatScenario) { + switch s.scenarioType { + case "encode-csv": + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder(',')), s.description) + case "encode-tsv": + 
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder('\t')), s.description) + case "decode-csv-object": + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewCSVObjectDecoder(','), NewYamlEncoder(2, false, true, true)), s.description) + case "decode-tsv-object": + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewCSVObjectDecoder('\t'), NewYamlEncoder(2, false, true, true)), s.description) + default: + panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType)) + } +} + +func documentCSVDecodeObjectScenario(t *testing.T, w *bufio.Writer, s formatScenario, formatType string) { + writeOrPanic(w, fmt.Sprintf("## %v\n", s.description)) + + if s.subdescription != "" { + writeOrPanic(w, s.subdescription) + writeOrPanic(w, "\n\n") + } + + writeOrPanic(w, fmt.Sprintf("Given a sample.%v file of:\n", formatType)) + writeOrPanic(w, fmt.Sprintf("```%v\n%v\n```\n", formatType, s.input)) + + writeOrPanic(w, "then\n") + writeOrPanic(w, fmt.Sprintf("```bash\nyq -p=%v sample.%v\n```\n", formatType, formatType)) + writeOrPanic(w, "will output\n") + + separator := ',' + if formatType == "tsv" { + separator = '\t' + } + + writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n", + processFormatScenario(s, NewCSVObjectDecoder(separator), NewYamlEncoder(s.indent, false, true, true))), + ) +} + +func documentCSVEncodeScenario(w *bufio.Writer, s formatScenario, formatType string) { + writeOrPanic(w, fmt.Sprintf("## %v\n", s.description)) + + if s.subdescription != "" { + writeOrPanic(w, s.subdescription) + writeOrPanic(w, "\n\n") + } + + writeOrPanic(w, "Given a sample.yml file of:\n") + writeOrPanic(w, fmt.Sprintf("```yaml\n%v\n```\n", s.input)) + + writeOrPanic(w, "then\n") + + expression := s.expression + + if expression != "" { + writeOrPanic(w, fmt.Sprintf("```bash\nyq -o=%v '%v' sample.yml\n```\n", formatType, expression)) + } else { + writeOrPanic(w, fmt.Sprintf("```bash\nyq -o=%v 
sample.yml\n```\n", formatType)) + } + writeOrPanic(w, "will output\n") + + separator := ',' + if formatType == "tsv" { + separator = '\t' + } + + writeOrPanic(w, fmt.Sprintf("```%v\n%v```\n\n", formatType, + processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder(separator))), + ) +} + +func documentCSVScenario(t *testing.T, w *bufio.Writer, i interface{}) { + s := i.(formatScenario) + if s.skipDoc { + return + } + switch s.scenarioType { + case "encode-csv": + documentCSVEncodeScenario(w, s, "csv") + case "encode-tsv": + documentCSVEncodeScenario(w, s, "tsv") + case "decode-csv-object": + documentCSVDecodeObjectScenario(t, w, s, "csv") + case "decode-tsv-object": + documentCSVDecodeObjectScenario(t, w, s, "tsv") + + default: + panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType)) + } +} + +func TestCSVScenarios(t *testing.T) { + for _, tt := range csvScenarios { + testCSVScenario(t, tt) + } + genericScenarios := make([]interface{}, len(csvScenarios)) + for i, s := range csvScenarios { + genericScenarios[i] = s + } + documentScenarios(t, "usage", "csv-tsv", genericScenarios, documentCSVScenario) +} diff --git a/pkg/yqlib/decoder.go b/pkg/yqlib/decoder.go index ecd39af3..adbb0e55 100644 --- a/pkg/yqlib/decoder.go +++ b/pkg/yqlib/decoder.go @@ -15,6 +15,8 @@ const ( PropertiesInputFormat Base64InputFormat JsonInputFormat + CSVObjectInputFormat + TSVObjectInputFormat ) type Decoder interface { @@ -32,6 +34,10 @@ func InputFormatFromString(format string) (InputFormat, error) { return PropertiesInputFormat, nil case "json", "ndjson", "j": return JsonInputFormat, nil + case "csv": + return CSVObjectInputFormat, nil + case "tsv": + return TSVObjectInputFormat, nil default: return 0, fmt.Errorf("unknown format '%v' please use [yaml|xml|props]", format) } diff --git a/pkg/yqlib/decoder_csv_object.go b/pkg/yqlib/decoder_csv_object.go new file mode 100644 index 00000000..20ccb6a9 --- /dev/null +++ b/pkg/yqlib/decoder_csv_object.go @@ -0,0 +1,77 @@ +package yqlib 
+ +import ( + "encoding/csv" + "errors" + "io" + + yaml "gopkg.in/yaml.v3" +) + +type csvObjectDecoder struct { + separator rune + reader csv.Reader + finished bool +} + +func NewCSVObjectDecoder(separator rune) Decoder { + return &csvObjectDecoder{separator: separator} +} + +func (dec *csvObjectDecoder) Init(reader io.Reader) { + dec.reader = *csv.NewReader(reader) + dec.reader.Comma = dec.separator + dec.finished = false +} + +func (dec *csvObjectDecoder) convertToYamlNode(content string) *yaml.Node { + node, err := parseSnippet(content) + if err != nil { + return createScalarNode(content, content) + } + return node +} + +func (dec *csvObjectDecoder) createObject(headerRow []string, contentRow []string) *yaml.Node { + objectNode := &yaml.Node{Kind: yaml.MappingNode, Tag: "!!map"} + + for i, header := range headerRow { + objectNode.Content = append( + objectNode.Content, + createScalarNode(header, header), + dec.convertToYamlNode(contentRow[i])) + } + return objectNode +} + +func (dec *csvObjectDecoder) Decode(rootYamlNode *yaml.Node) error { + if dec.finished { + return io.EOF + } + headerRow, err := dec.reader.Read() + log.Debugf(": headerRow%v", headerRow) + if err != nil { + return err + } + + rootArray := &yaml.Node{Kind: yaml.SequenceNode, Tag: "!!seq"} + + contentRow, err := dec.reader.Read() + + for err == nil && len(contentRow) > 0 { + log.Debugf("Adding contentRow: %v", contentRow) + rootArray.Content = append(rootArray.Content, dec.createObject(headerRow, contentRow)) + contentRow, err = dec.reader.Read() + log.Debugf("Read next contentRow: %v, %v", contentRow, err) + } + if !errors.Is(err, io.EOF) { + return err + } + + log.Debugf("finished, contentRow%v", contentRow) + log.Debugf("err: %v", err) + + rootYamlNode.Kind = yaml.DocumentNode + rootYamlNode.Content = []*yaml.Node{rootArray} + return nil +} diff --git a/pkg/yqlib/doc/usage/csv-tsv.md b/pkg/yqlib/doc/usage/csv-tsv.md new file mode 100644 index 00000000..fb4b6c30 --- /dev/null +++ 
b/pkg/yqlib/doc/usage/csv-tsv.md @@ -0,0 +1,143 @@ + +{% hint style="warning" %} +Note that versions prior to 4.18 require the 'eval/e' command to be specified. + +`yq e <exp> <file>` +{% endhint %} + +## Encode CSV simple +Given a sample.yml file of: +```yaml +- [i, like, csv] +- [because, excel, is, cool] +``` +then +```bash +yq -o=csv sample.yml +``` +will output +```csv +i,like,csv +because,excel,is,cool +``` + +## Encode TSV simple +Given a sample.yml file of: +```yaml +- [i, like, csv] +- [because, excel, is, cool] +``` +then +```bash +yq -o=tsv sample.yml +``` +will output +```tsv +i like csv +because excel is cool +``` + +## Encode array of objects to csv +Add the header row manually, then we convert each object into an array of values - resulting in an array of arrays. Nice thing about this method is you can pick the columns and call the header whatever you like. + +Given a sample.yml file of: +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 + +``` +then +```bash +yq -o=csv '[["Name", "Number of Cats"]] + [.[] | [.name, .numberOfCats ]]' sample.yml +``` +will output +```csv +Name,Number of Cats +Gary,1 +Samantha's Rabbit,2 +``` + +## Encode array of objects to csv - generic +This is a little trickier than the previous example - we dynamically work out the $header, and use that to automatically create the value arrays. 
+ +Given a sample.yml file of: +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 + +``` +then +```bash +yq -o=csv '(.[0] | keys | .[] ) as $header | [[$header]] + [.[] | [ .[$header] ]]' sample.yml +``` +will output +```csv +name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 +``` + +## Parse CSV into an array of objects +First row is assumed to define the fields + +Given a sample.csv file of: +```csv +name,numberOfCats,likesApples,height +Gary,1,true,168.8 +Samantha's Rabbit,2,false,-188.8 + +``` +then +```bash +yq -p=csv sample.csv +``` +will output +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 +``` + +## Parse TSV into an array of objects +First row is assumed to define the fields + +Given a sample.tsv file of: +```tsv +name numberOfCats likesApples height +Gary 1 true 168.8 +Samantha's Rabbit 2 false -188.8 + +``` +then +```bash +yq -p=tsv sample.tsv +``` +will output +```yaml +- name: Gary + numberOfCats: 1 + likesApples: true + height: 168.8 +- name: Samantha's Rabbit + numberOfCats: 2 + likesApples: false + height: -188.8 +``` + diff --git a/pkg/yqlib/operators_test.go b/pkg/yqlib/operators_test.go index 92b2caeb..0f9fa49d 100644 --- a/pkg/yqlib/operators_test.go +++ b/pkg/yqlib/operators_test.go @@ -31,7 +31,7 @@ type expressionScenario struct { } func TestMain(m *testing.M) { - logging.SetLevel(logging.ERROR, "") + logging.SetLevel(logging.DEBUG, "") Now = func() time.Time { return time.Date(2021, time.May, 19, 1, 2, 3, 4, time.UTC) }